Example #1
	def _join_handler(self):
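		"Join thread: polls the SAGA job service for state changes and updates the database"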

		set_thread_title()

		session = self._create_session()

		# TODO make it more robust to exceptions
		with self._lock:
			while self._running:

				#self._log.debug("Checking for the state of jobs ...")

				for job in session.query(SagaJob).filter(
								SagaJob.saga_state != None,
								~SagaJob.state.in_(runstates.TERMINAL_STATES)):

					#self._log.debug("Checking state for [{}] {} ... ".format(job.id, job.name))

					try:
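						# record when we first queried SAGA for this job; used for the reconnect timeout below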
						if job.saga_getjob_start is None:
							job.saga_getjob_start = time.time()
							session.commit()

						#self._qlock.release()
						saga_job = self._job_service.get_job(job.saga_id)
						#self._qlock.acquire()

						job.saga_state = saga_job.state

						next_state = self.__JOB_STATE_FROM_SAGA[job.saga_state]
						if job.state == next_state:
							continue

						if job.saga_state in [saga.job.DONE, saga.job.FAILED, saga.job.CANCELED]:
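							# the job reached a terminal state: record exit code, timestamps and execution hosts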
							job.exitcode = saga_job.exit_code
							job.created = datetime_from_posix(saga_job.created)
							job.started = datetime_from_posix(saga_job.started)
							job.finished = datetime_from_posix(saga_job.finished)
							job.hostname = saga_job.execution_hosts

					except saga.NoSuccess:
						# wait up to 120 seconds before failing
						if time.time() - job.saga_getjob_start >= 120:
							self._log.error("Couldn't reconnect to job {}".format(job.saga_id))
							# what should we do ? failing or finishing ?
							job.state = runstates.FAILED # let's be paranoid !
							session.commit()

							self._lock.release()
							self._fire_job_updated(job)
							self._lock.acquire()
						continue

					if job.saga_state == saga.job.DONE and job.exitcode != 0:
						next_state = runstates.FAILED

					if job.state != runstates.WAITING and next_state == runstates.WAITING:
						self._queued_count += 1
					elif job.state == runstates.WAITING and next_state != runstates.WAITING:
						self._queued_count -= 1

					job.state = next_state

					session.commit()

					self._lock.release()
					self._fire_job_updated(job)
					self._lock.acquire()

					if not self._running:
						break

				if self._running:
					self._lock.release()
					time.sleep(self._state_check_interval)
					self._lock.acquire()

		session.close()

		self._log.debug("Join thread finished")
Example #2
	def _run_handler(self):
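		"Run thread: submits WAITING jobs through the SAGA job service"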

		set_thread_title()

		session = self._create_session()

		with self._lock:
			while self._running:

				if self._max_queued != 0 and self._queued_count >= self._max_queued:
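					# the submission limit has been reached: back off before queueing more jobs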
					self._lock.release()
					time.sleep(1)
					self._lock.acquire()
					continue

				job = None

				try:
					#self._log.debug("Looking for waiting jobs ...")
					job = session.query(SagaJob).filter(
							SagaJob.state == runstates.WAITING,
							SagaJob.saga_state == None).order_by(SagaJob.priority).first()

					if job is not None:
						pass #self._queued_count += 1
						#self._log.debug("Found: {}".format(repr(job)))
				except Exception as e:
					self._log.exception(e)

				if job is None:
					self._lock.release()
					time.sleep(1)
					self._lock.acquire()
					continue

				self._lock.release()

				jd = saga.job.Description()
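				# fill in the job description from the job definition and the task configuration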
				jd.name = job.name

				jd.executable = job.script
				jd.arguments = []
				jd.environment = job.env

				jd.spmd_variation = self._task_conf(job, "pe", self._pe)
				jd.total_cpu_count = self._task_conf(job, "cpu_count", self._cpu_count)

				jd.queue = self._task_conf(job, "queue", self._queue)
				jd.project = self._task_conf(job, "project", self._project)
				jd.working_directory = self._task_conf(job, "working_directory", self._working_directory)

				jd.output = jd.error = os.path.join(self._remote_output_path, job.case_name, "{}.txt".format(job.name))

				# remove previous output file
				try:
					self._log.debug("Removing remote output for [{}] {} ...".format(job.id, job.name))
					self._remote_dir.remove(jd.output)
				except saga.DoesNotExist:
					pass
				except Exception as ex:
					self._log.error("Couldn't remove remote job output file: {}".format(ex))

				self._lock.acquire()

				# create and run the job
				saga_job = None
				try:
					self._log.debug("Running job [{}] {} ...".format(job.id, job.name))
					saga_job = self._job_service.create_job(jd)
					saga_job.run()

					job.saga_id = saga_job.id
					job.saga_state = saga_job.state
					job.output = jd.output
					session.commit()

					self._queued_count += 1

				except Exception as ex:
					self._log.exception(ex)
					if saga_job is not None: # and saga_job.state in [saga.job.RUNNING, saga.job.SUSPENDED]:
						try:
							saga_job.cancel()
						finally:
							session.rollback()

		session.close()

		self._log.debug("Run thread finished")
Example #3
	def _join(self):
		"Joiner thread"

		set_thread_title()

		self._num_alive_threads += 1

		_log = logger.get_logger("wok.engine.join")

		_log.debug("Engine join thread ready")

		session = None

		num_exc = 0

		while self._running:
			try:
				set_thread_title("waiting")

				job_info = self.__queue_adaptative_get(self._join_queue)
				if job_info is None:
					continue

				workitem_id, job_id = job_info

				with self._lock:
					session = db.Session()

					workitem = session.query(db.WorkItem).filter(db.WorkItem.id == workitem_id).one()

					case = self._cases_by_name[workitem.case.name]
					task = case.component(workitem.task.cname)

					set_thread_title(task.cname)

					#_log.debug("Joining work-item %s ..." % task.cname)

					jr = task.platform.jobs.join(job_id)
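					# jr holds the job results: hostname, timestamps and exit code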

					wr = Data.element(dict(
							hostname=jr.hostname,
							created=jr.created.strftime(_DT_FORMAT) if jr.created is not None else None,
							started=jr.started.strftime(_DT_FORMAT) if jr.started is not None else None,
							finished=jr.finished.strftime(_DT_FORMAT) if jr.finished is not None else None,
							exitcode=jr.exitcode.code if jr.exitcode is not None else None))

					r = task.platform.data.load_workitem_result(case.name, task.cname, workitem.index)

					if r is not None:
						if r.exception is not None:
							wr["exception"] = r.exception
						if r.trace is not None:
							wr["trace"] = r.trace

					workitem.substate = None
					workitem.result = wr

					case.num_active_workitems -= 1

					session.commit()

					# check if there are still more work-items
					num_workitems = session.query(func.count(db.WorkItem.id)).filter(
						~db.WorkItem.state.in_(runstates.TERMINAL_STATES)).scalar()

					if self._single_run and num_workitems == 0:
						stop_engine = True
						for case in self._cases:
							stop_engine = stop_engine and (case.state in runstates.TERMINAL_STATES)
						#self._running = not stop_engine
						if stop_engine:
							self._finished_event.set()

					_log.debug("[{}] Joined work-item {}".format(case.name, workitem.cname))

					# check stopping instances
					if case in self._stopping_cases:
						job_ids = self._stopping_cases[case]
						if job_id in job_ids:
							job_ids.remove(job_id)

						if len(job_ids) == 0:
							del self._stopping_cases[case]
							if case.state == runstates.ABORTING:
								workitem.case.state = case.state = runstates.ABORTED

							session.commit()

							if case.removed:
								self.__remove_case(session, case)
								session.commit()
						else:
							_log.debug("Still waiting for {} jobs to stop".format(len(job_ids)))

					if case.state in runstates.TERMINAL_STATES and case.num_active_workitems == 0:
						_log.info("[{}] Case {}. Total time: {}".format(case.name, case.state.title, str(case.elapsed)))

						self._lock.release()
						try:
							self.case_finished.send(case)
						finally:
							self._lock.acquire()

			except BaseException as ex:
				num_exc += 1
				_log.warn("Exception in join thread ({}): {}".format(num_exc, str(ex)))
				from traceback import format_exc
				_log.debug(format_exc())

				try:
					if session is not None:
						session.rollback()
				except Exception as ex:
					_log.warn("Session rollback failed")
					_log.exception(ex)

			finally:
				try:
					if session is not None:
						session.close()
				except Exception as ex:
					_log.warn("Session close failed")
					_log.exception(ex)

		self._num_alive_threads -= 1

		_log.debug("Engine join thread finished")
Example #4
	def _logs(self, index):
		"Log retrieval thread"

		set_thread_title()

		self._num_alive_threads += 1

		_log = logger.get_logger("wok.engine.logs-{}".format(index))
		
		_log.debug("Engine logs thread ready")

		num_exc = 0

		while self._running:
			set_thread_title("waiting")

			# get the next task to retrieve the logs
			job_info = self.__queue_adaptative_get(self._logs_queue)
			if job_info is None:
				continue

			workitem_id, job_id = job_info

			session = db.Session()

			workitem = task = None
			try:
				workitem = session.query(db.WorkItem).filter(db.WorkItem.id == workitem_id).one()

				case = self._cases_by_name[workitem.case.name]
				task = case.component(workitem.task.cname)

				set_thread_title(workitem.cname)

				_log.debug("[{}] Reading logs for work-item {} ...".format(case.name, workitem.cname))

				output = task.platform.jobs.output(job_id)
				if output is None:
					output = StringIO.StringIO()

				path = os.path.join(self._work_path, "logs", case.name, task.cname)
				if not os.path.isdir(path):
					try:
						os.makedirs(path)
					except:
						if not os.path.isdir(path):
							raise

				path = os.path.join(path, "{:08}.db".format(workitem.index))
				if os.path.isfile(path):
					os.remove(path)

				logs_db = LogsDb(path)
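				# write the captured output into the logs database for this work-item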
				logs_db.open()
				logs_db.add(case.name, task.cname, workitem.index, output)
				logs_db.close()

				_log.debug("[{}] Done with logs of work-item {}".format(case.name, workitem.cname))

			except BaseException as ex:
				num_exc += 1
				session.rollback()
				_log.info("Exception in logs thread ({}): {}".format(num_exc, str(ex)))
				from traceback import format_exc
				_log.debug(format_exc())

			finally:
				# hand the work-item over to the join thread even if log retrieval failed
				if workitem is not None:
					workitem.substate = runstates.JOINING
				self._join_queue.put(job_info)
				session.commit()
				session.close()

		self._num_alive_threads -= 1

		_log.debug("Engine logs thread finished")
Example #5
	def _run(self):
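		"Main engine thread: starts the logs and join threads, schedules tasks and tracks job state changes"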

		set_thread_title()

		num_exc = 0

		self._running = True

		self._num_alive_threads += 1

		# Start the logs threads

		for i in range(self._num_log_threads):
			t = threading.Thread(target=self._logs, args=(i, ), name="wok-engine-logs-%d" % i)
			self._logs_threads += [t]
			t.start()

		# Start the join thread

		self._join_thread = threading.Thread(target=self._join, name="wok-engine-join")
		self._join_thread.start()

		_log = logger.get_logger("wok.engine.run")

		_log.debug("Engine run thread ready")

		while self._running:

			session = db.Session()

			try:
				#_log.debug("Scheduling new tasks ...")
				set_thread_title("scheduling")

				updated_tasks = set()

				# schedule tasks ready to be executed and save new workitems into the db
				for case in self._cases:
					tasks = case.schedule(session)
					updated_tasks.update(tasks)
					session.commit()

				# submit workitems ready to be executed
				for platform in self._platforms:
					job_submissions = self.__job_submissions(session, platform)
					for js, job_id, job_state in platform.submit(job_submissions):
						workitem = session.query(db.WorkItem).filter(db.WorkItem.id == js.workitem_id).one()
						workitem.job_id = job_id
						workitem.state = job_state
						js.task.dirty = True
						session.commit()
						updated_tasks.add(js.task)

				session.close()
				session = None

				#_log.debug("Waiting for events ...")

				set_thread_title("waiting")

				while len(updated_tasks) == 0 and not self._notified and self._running:
					self._cvar.wait(1)
				self._notified = False

				if not self._running:
					break

				session = db.Session() # there is a session.close() in the finally block

				#_log.debug("Stopping jobs for aborting instances ...")

				set_thread_title("working")

				# check stopping instances
				for case in self._cases:
					if (case.state == runstates.ABORTING or case.removed) and case not in self._stopping_cases:
						num_job_ids = session.query(db.WorkItem.job_id).filter(db.WorkItem.case_id == case.id)\
											.filter(~db.WorkItem.state.in_(runstates.TERMINAL_STATES)).count()
						if num_job_ids == 0:
							if case.state == runstates.ABORTING:
								_log.debug("Aborted case {} with no running jobs".format(case.name))
								dbcase = session.query(db.Case).filter(db.Case.id == case.id).one()
								dbcase.state = case.state = runstates.ABORTED
								session.commit()
							else:
								_log.debug("Stopped case {} with no running jobs".format(case.name))

							if case.removed:
								_log.debug("Removing case {} with no running jobs".format(case.name))
								self.__remove_case(session, case)
								session.commit()
						else:
							_log.info("Stopping {} jobs for case {} ...".format(num_job_ids, case.name))

							self._stopping_cases[case] = set()
							for platform in self._platforms:
								job_ids = [int(r[0]) for r in session.query(db.WorkItem.job_id)
															.filter(db.WorkItem.case_id == case.id)\
															.filter(db.WorkItem.platform == platform.name)\
															.filter(~db.WorkItem.state.in_(runstates.TERMINAL_STATES))]

								self._stopping_cases[case].update(job_ids)

								platform.jobs.abort(job_ids)

				#_log.debug("Checking job state changes ...")

				# detect workitems which state has changed
				for platform in self._platforms:
					for job_id, state in platform.jobs.state():
						try:
							workitem = session.query(db.WorkItem).filter(db.WorkItem.job_id == job_id).one()
						except NoResultFound:
							_log.warn("No work-item available for the job {0} while retrieving state".format(job_id))
							platform.jobs.abort([job_id])
							platform.jobs.join(job_id)
							continue

						if workitem.state != state:
							case = self._cases_by_name[workitem.case.name]
							task = case.component(workitem.task.cname)
							task.dirty = True

							workitem.state = state
							workitem.substate = runstates.LOGS_RETRIEVAL
							session.commit()
							updated_tasks.add(task)

							# if workitem has finished, queue it for logs retrieval
							if state in runstates.TERMINAL_STATES:
								self._logs_queue.put((workitem.id, job_id))

							_log.debug("[{}] Work-Item {} changed state to {}".format(case.name, workitem.cname, state))

				#_log.debug("Updating components state ...")

				# update affected components state
				updated_cases = set([task.case for task in updated_tasks])
				for case in updated_cases:
					case.update_states(session)
					case.update_count_by_state(session)
					case.clean_components(session)
					session.commit()

					if case.state == runstates.RUNNING:
						self._lock.release()
						try:
							self.case_started.send(case)
						finally:
							self._lock.acquire()

				for task in updated_tasks:
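					# build a one-line summary of the task state and its work-item counts per state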
					case = task.case
					#_log.debug("[{}] Component {} updated state to {} ...".format(
					#				component.case.name, component.cname, component.state))

					count = task.workitem_count_by_state
					sb = ["[{}] {} ({})".format(case.name, task.cname, task.state.title)]
					sep = " "
					for state in runstates.STATES:
						if state in count:
							sb += [sep, "{}={}".format(state.symbol, count[state])]
							if sep == " ":
								sep = ", "

					if task.state == runstates.FINISHED and task.state in count:
						elapsed = str(task.elapsed)
						elapsed = elapsed.split(".")[0]
						sb += [" ", "<{}>".format(elapsed)]

					_log.info("".join(sb))

			except BaseException as ex:
				num_exc += 1
				_log.warn("Exception in run thread ({}): {}".format(num_exc, str(ex)))
				#if num_exc > 3:
				#	raise
				#else:
				from traceback import format_exc
				_log.debug(format_exc())

				try:
					if session is not None:
						session.rollback()
				except Exception as ex:
					_log.warn("Session rollback failed")
					_log.exception(ex)

			finally:
				try:
					if session is not None:
						session.close()
				except Exception as ex:
					_log.warn("Session close failed")
					_log.exception(ex)

				session = None

		set_thread_title("finishing")

		try:
			# print cases state before leaving the thread
			#for case in self._cases:
			#	_log.debug("Case state:\n" + repr(case))

			for t in self._logs_threads:
				t.join()

			self._lock.release()
			self._join_thread.join()
			self._lock.acquire()

			_log.debug("Engine run thread finished")
		except Exception as ex:
			_log.exception(ex)
		
		self._running = False
		self._num_alive_threads -= 1
Example #6
	def _run(self):
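		"Run thread: executes job scripts with /bin/bash and tracks their state"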
		self._log.debug("Run thread ready")

		session = self._create_session()
		try:
			while self._running:
				set_thread_title()

				job = self._next_job(session)
				if job is None:
					break

				set_thread_title("{}".format(job.name))

				self._log.debug("Running task [{}] {} ...".format(job.id, job.name))

				# Prepare execution

				script, env = job.script, job.env

				sb = ["Script:\n", script]
				if len(env) > 0:
					for k, v in env.iteritems():
						sb += ["\n{}={}".format(k, v)]
				self._log.debug("".join(sb))

				o = open(job.output, "w")

				cwd = self._conf.get("working_directory")
				if cwd is not None:
					cwd = os.path.abspath(cwd)

				job.hostname = self.hostname
				job.started = datetime.now()
				self._update_job(session, job)

				# Prepare the script file

				fd, script_path = tempfile.mkstemp(prefix=job.name + "-", suffix=".sh")
				with os.fdopen(fd, "w") as f:
					f.write(script)

				# Run the script

				process = None
				try:
					process = subprocess.Popen(
										args=["/bin/bash", script_path],
										stdin=None,
										stdout=o,
										stderr=subprocess.STDOUT,
										cwd=cwd,
										env=env,
										preexec_fn=os.setsid)

					set_thread_title("{} PID={}".format(job.name, process.pid))

					session.refresh(job, ["state"])
					timeout = ProgressiveTimeout(0.5, 6.0, 0.5)
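					# poll until the script finishes, the job is aborted in the database, or the handler is being killed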
					while job.state == runstates.RUNNING and process.poll() is None and not self._kill_threads:
						timeout.sleep()
						session.refresh(job, ["state"])

					job.finished = datetime.now()

					if process.poll() is None:
						self._log.info("Killing job [{}] {} ...".format(job.id, job.name))
						os.killpg(process.pid, signal.SIGTERM)
						timeout = ProgressiveTimeout(0.5, 6.0, 0.5)
						while process.poll() is None:
							timeout.sleep()

						job.state = runstates.ABORTED
						job.exitcode = process.returncode

					elif job.state == runstates.ABORTING:
						job.state = runstates.ABORTED

					else:
						job.state = runstates.FINISHED if process.returncode == 0 else runstates.FAILED
						job.exitcode = process.returncode

				except Exception as e:
					self._log.exception(e)
					job.state = runstates.FAILED
					job.finished = datetime.now()
					job.exitcode = exit_codes.RUN_THREAD_EXCEPTION
				finally:
					try:
						if process is not None:
							if process.poll() is None:
								self._log.info("Killing job [{}] {} ...".format(job.id, job.name))
								os.killpg(process.pid, signal.SIGKILL)
							process.wait()
					except:
						self._log.exception("Exception while waiting for process {} to finish".format(process.pid))

					o.close()
					if os.path.exists(script_path):
						os.remove(script_path)

				if job.state == runstates.FINISHED:
					self._log.debug("Job finished [{}] {}".format(job.id, job.name))
				elif job.state == runstates.ABORTED:
					self._log.debug("Job aborted [{}] {}".format(job.id, job.name))
				else:
					self._log.debug("Job failed [{}] {}".format(job.id, job.name))

				self._update_job(session, job)

		except:
			self._log.exception("Unexpected exception in thread {}".format(current_thread().name))
		finally:
			session.close()

		self._log.debug("Run thread finished")