def stop_operation(operation_id):
    # TODO: Review this implementation after DAINT maintenance
    operation = dao.get_operation_by_id(operation_id)
    if not operation or operation.has_finished:
        LOGGER.warning("Operation already stopped: {}".format(operation_id))
        return True

    LOGGER.debug("Stopping HPC operation: {}".format(operation_id))
    op_ident = OperationDAO().get_operation_process_for_operation(operation_id)
    if op_ident is not None:
        # TODO: Handle login
        transport = unicore_client.Transport(os.environ[HPCSchedulerClient.CSCS_LOGIN_TOKEN_ENV_KEY])
        # Abort the remote HPC job if it is still running
        job = Job(transport, op_ident.job_id)
        if job.is_running():
            job.abort()

    # Stop the local monitoring thread, if one is registered for this operation
    operation_thread = get_op_thread(operation_id)
    if operation_thread is None:
        LOGGER.warning("Thread for operation {} is not available".format(operation_id))
    else:
        operation_thread.stop()
        # Wait until the thread confirms it has shut down
        while not operation_thread.stopped():
            LOGGER.info("Thread for operation {} is stopping".format(operation_id))

    BurstService().persist_operation_state(operation, STATUS_CANCELED)
    return True
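# The stop()/stopped() calls above assume a cooperatively-stoppable worker
# thread. A minimal sketch of such a thread, assuming the usual
# threading.Event pattern; OperationThread is a hypothetical name, and the
# real class and its run() body live elsewhere in the codebase.
import threading


class OperationThread(threading.Thread):

    def __init__(self, operation_id):
        super(OperationThread, self).__init__()
        self.operation_id = operation_id
        self._stop_event = threading.Event()

    def stop(self):
        # Request shutdown; run() is expected to poll stopped() and exit.
        self._stop_event.set()

    def stopped(self):
        return self._stop_event.is_set()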
def _create_job_with_pyunicore(pyunicore_client, job_description, job_script, inputs):
    # type: (Client, dict, str, list) -> Job
    """
    Submit and start a batch job on the site, optionally uploading input data files.
    Adapted from pyunicore's Client.new_job method so that our own upload method can be used.
    :return: job
    """
    # The REST API expects the string "true" rather than a boolean
    if len(inputs) > 0 or job_description.get('haveClientStageIn') is True:
        job_description['haveClientStageIn'] = "true"

    # Submit the job description and read the new job's URL from the Location header
    with closing(pyunicore_client.transport.post(url=pyunicore_client.site_urls['jobs'],
                                                 json=job_description)) as resp:
        job_url = resp.headers['Location']
    job = Job(pyunicore_client.transport, job_url)

    if len(inputs) > 0:
        # Stage in the job script and the input files with our own upload method
        working_dir = job.working_dir
        HPCSchedulerClient._upload_file_with_pyunicore(working_dir, job_script, None)
        for input_file in inputs:
            HPCSchedulerClient._upload_file_with_pyunicore(working_dir, input_file)
    if job_description.get('haveClientStageIn', None) == "true":
        try:
            job.start()
        except Exception:
            # Best-effort start, as before; log instead of silently swallowing the error
            LOGGER.warning("Could not explicitly start job {}".format(job_url), exc_info=True)
    return job
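# A hedged usage sketch for _create_job_with_pyunicore. The site URL, the job
# description keys and the file names below are illustrative assumptions, not
# values taken from this codebase; only Transport, Client and the env key are
# used as defined above.
import os

from pyunicore.client import Client, Transport


def submit_example_job(site_url):
    transport = Transport(os.environ[HPCSchedulerClient.CSCS_LOGIN_TOKEN_ENV_KEY])
    client = Client(transport, site_url)
    # Standard UNICORE REST job description; 'Executable' runs after stage-in
    job_description = {
        'Executable': 'sh job_script.sh',
        'haveClientStageIn': True,
    }
    return HPCSchedulerClient._create_job_with_pyunicore(
        client, job_description,
        job_script='job_script.sh',  # hypothetical script path
        inputs=['view_model.h5'])    # hypothetical input file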
def check_operations_job():
    operations = dao.get_operations()
    if operations is None or len(operations) == 0:
        return

    for operation in operations:
        HPCOperationService.LOGGER.info("Start processing operation {}".format(operation.id))
        try:
            op_ident = dao.get_operation_process_for_operation(operation.id)
            if op_ident is not None:
                transport = Transport(os.environ[HPCSchedulerClient.CSCS_LOGIN_TOKEN_ENV_KEY])
                job = Job(transport, op_ident.job_id)
                job_status = job.properties['status']
                if job.is_running():
                    if operation.status == STATUS_PENDING and job_status == HPCJobStatus.READY.value:
                        HPCOperationService._operation_started(operation)
                    HPCOperationService.LOGGER.info(
                        "CSCS job status: {} for operation {}.".format(job_status, operation.id))
                    # Job is still running; check the next operation
                    continue
                HPCOperationService.LOGGER.info(
                    "Job for operation {} has status {}".format(operation.id, job_status))
                if job_status == HPCJobStatus.SUCCESSFUL.value:
                    simulator_gid = operation.view_model_gid
                    HPCOperationService._operation_finished(operation, simulator_gid)
                else:
                    HPCOperationService._operation_error(operation)
        except Exception:
            HPCOperationService.LOGGER.error(
                "Error while processing operation {} in the background".format(operation.id),
                exc_info=True)
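# HPCJobStatus above is assumed to mirror the status strings UNICORE reports
# in job.properties['status']. A minimal sketch of such an enum; the real
# definition lives elsewhere, and the member set here follows UNICORE's
# documented job states.
from enum import Enum


class HPCJobStatus(Enum):
    READY = "READY"
    QUEUED = "QUEUED"
    RUNNING = "RUNNING"
    SUCCESSFUL = "SUCCESSFUL"
    FAILED = "FAILED"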
def _operation_finished(operation, simulator_gid):
    op_ident = dao.get_operation_process_for_operation(operation.id)
    # TODO: Handle login
    job = Job(Transport(os.environ[HPCSchedulerClient.CSCS_LOGIN_TOKEN_ENV_KEY]),
              op_ident.job_id)

    # Refresh the operation from the DB before staging out results
    operation = dao.get_operation_by_id(operation.id)
    folder = HPCSchedulerClient.storage_interface.get_project_folder(operation.project.name)
    storage_interface = StorageInterface()
    if storage_interface.encryption_enabled():
        storage_interface.inc_project_usage_count(folder)
        storage_interface.sync_folders(folder)

    try:
        sim_h5_filenames, metric_op, metric_h5_filename = \
            HPCSchedulerClient.stage_out_to_operation_folder(job.working_dir, operation, simulator_gid)

        operation.mark_complete(STATUS_FINISHED)
        dao.store_entity(operation)
        HPCSchedulerClient().update_db_with_results(operation, sim_h5_filenames, metric_op,
                                                    metric_h5_filename)
    except OperationException as exception:
        HPCOperationService.LOGGER.error(exception)
        HPCOperationService._operation_error(operation)
    finally:
        # Keep encrypted project storage consistent regardless of the outcome
        if storage_interface.encryption_enabled():
            storage_interface.sync_folders(folder)
            storage_interface.set_project_inactive(operation.project)
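# check_operations_job is a one-shot pass over pending HPC operations, so some
# scheduler has to invoke it periodically. A minimal sketch of such a driver,
# assuming a plain polling loop; the interval is an arbitrary choice, not a
# value from this codebase.
import time

HPC_POLL_INTERVAL_SEC = 10


def monitor_hpc_operations():
    while True:
        HPCOperationService.check_operations_job()
        time.sleep(HPC_POLL_INTERVAL_SEC)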