Example #1
    def stop_operation(operation_id):
        # TODO: Review this implementation after DAINT maintenance
        operation = dao.get_operation_by_id(operation_id)
        if not operation or operation.has_finished:
            LOGGER.warning("Operation already stopped: %s" % operation_id)
            return True

        LOGGER.debug("Stopping HPC operation: %s" % str(operation_id))
        op_ident = OperationDAO().get_operation_process_for_operation(
            operation_id)
        if op_ident is not None:
            # TODO: Handle login
            transport = unicore_client.Transport(
                os.environ[HPCSchedulerClient.CSCS_LOGIN_TOKEN_ENV_KEY])
            # Abort HPC job
            job = Job(transport, op_ident.job_id)
            if job.is_running():
                job.abort()

        # Kill thread
        operation_thread = get_op_thread(operation_id)
        if operation_thread is None:
            LOGGER.warning("Thread for operation {} is not available".format(
                operation_id))
        else:
            operation_thread.stop()
            # Busy-wait until the thread acknowledges the stop request
            while not operation_thread.stopped():
                LOGGER.info("Thread for operation %s is stopping", operation_id)
        BurstService().persist_operation_state(operation, STATUS_CANCELED)
        return True
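The stop path above relies on get_op_thread returning a thread object that exposes stop() and stopped(). Those helpers are not shown on this page; below is a minimal sketch of such a stoppable thread, assuming a threading.Event based design (the class name and the run() body are assumptions, not the project's actual implementation).

    import threading

    class OperationExecutor(threading.Thread):
        # Hypothetical stoppable worker; name and run() body are assumptions.
        def __init__(self, operation_id):
            super(OperationExecutor, self).__init__()
            self.operation_id = operation_id
            self._stop_event = threading.Event()

        def stop(self):
            # Request a cooperative shutdown; run() must poll stopped().
            self._stop_event.set()

        def stopped(self):
            return self._stop_event.is_set()

        def run(self):
            while not self.stopped():
                # Monitor the HPC job here, honoring the stop flag.
                self._stop_event.wait(1.0)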
Example #2
    def _create_job_with_pyunicore(pyunicore_client, job_description,
                                   job_script, inputs):
        # type: (Client, dict, str, list) -> Job
        """
        Submit and start a batch job on the site, optionally uploading input data files.
        We took this code from the pyunicore Client.new_job method in order to use our own upload method.
        :return: job
        """

        if len(inputs) > 0 or job_description.get('haveClientStageIn') is True:
            job_description['haveClientStageIn'] = "true"

        with closing(
                pyunicore_client.transport.post(
                    url=pyunicore_client.site_urls['jobs'],
                    json=job_description)) as resp:
            job_url = resp.headers['Location']

        job = Job(pyunicore_client.transport, job_url)

        if len(inputs) > 0:
            working_dir = job.working_dir
            HPCSchedulerClient._upload_file_with_pyunicore(
                working_dir, job_script, None)
            # input_file avoids shadowing the built-in input()
            for input_file in inputs:
                HPCSchedulerClient._upload_file_with_pyunicore(
                    working_dir, input_file)
        if job_description.get('haveClientStageIn', None) == "true":
            try:
                job.start()
            except Exception:
                # Tolerate a failed start, e.g. when the job is no longer startable
                pass

        return job
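For orientation, a call site could look like the sketch below; the site URL and the job description are illustrative assumptions, and the token is read the same way as in the other examples on this page.

    import os
    from pyunicore.client import Client, Transport

    # SITE_URL is a placeholder; the real value is deployment-specific.
    SITE_URL = "https://unicore.example.org/SITE/rest/core"
    transport = Transport(os.environ[HPCSchedulerClient.CSCS_LOGIN_TOKEN_ENV_KEY])
    client = Client(transport, SITE_URL)
    job_description = {'Executable': 'bash', 'Arguments': ['job_script.sh']}
    job = HPCSchedulerClient._create_job_with_pyunicore(
        client, job_description, 'job_script.sh', inputs=[])
    print(job.properties['status'])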
Example #3
    def check_operations_job():
        operations = dao.get_operations()
        if operations is None or len(operations) == 0:
            return

        for operation in operations:
            HPCOperationService.LOGGER.info("Start processing operation {}".format(operation.id))
            try:
                op_ident = dao.get_operation_process_for_operation(operation.id)
                if op_ident is not None:
                    transport = Transport(os.environ[HPCSchedulerClient.CSCS_LOGIN_TOKEN_ENV_KEY])
                    job = Job(transport, op_ident.job_id)
                    job_status = job.properties['status']
                    if job.is_running():
                        if operation.status == STATUS_PENDING and job_status == HPCJobStatus.READY.value:
                            HPCOperationService._operation_started(operation)
                        HPCOperationService.LOGGER.info(
                            "CSCS job status: {} for operation {}.".format(job_status, operation.id))
                        # Still running: check the next operation
                        continue
                    HPCOperationService.LOGGER.info(
                        "Job for operation {} has status {}".format(operation.id, job_status))
                    if job_status == HPCJobStatus.SUCCESSFUL.value:
                        simulator_gid = operation.view_model_gid
                        HPCOperationService._operation_finished(operation, simulator_gid)
                    else:
                        HPCOperationService._operation_error(operation)
            except Exception:
                HPCOperationService.LOGGER.error(
                    "Background processing failed for operation {}".format(operation.id),
                    exc_info=True)
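check_operations_job compares job_status against HPCJobStatus members defined elsewhere in the project. Below is a plausible sketch of that enum, assuming the standard UNICORE job state strings; the real member set may differ.

    from enum import Enum

    class HPCJobStatus(Enum):
        # Assumed to mirror UNICORE job states; verify against the real module.
        STAGINGIN = "STAGINGIN"
        READY = "READY"
        QUEUED = "QUEUED"
        RUNNING = "RUNNING"
        STAGINGOUT = "STAGINGOUT"
        SUCCESSFUL = "SUCCESSFUL"
        FAILED = "FAILED"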
Example #4
    def _operation_finished(operation, simulator_gid):
        op_ident = dao.get_operation_process_for_operation(operation.id)
        # TODO: Handle login
        job = Job(
            Transport(os.environ[HPCSchedulerClient.CSCS_LOGIN_TOKEN_ENV_KEY]),
            op_ident.job_id)

        operation = dao.get_operation_by_id(operation.id)
        folder = HPCSchedulerClient.storage_interface.get_project_folder(
            operation.project.name)
        storage_interface = StorageInterface()
        if storage_interface.encryption_enabled():
            storage_interface.inc_project_usage_count(folder)
            storage_interface.sync_folders(folder)

        try:
            sim_h5_filenames, metric_op, metric_h5_filename = \
                HPCSchedulerClient.stage_out_to_operation_folder(job.working_dir, operation, simulator_gid)

            operation.mark_complete(STATUS_FINISHED)
            dao.store_entity(operation)
            HPCSchedulerClient().update_db_with_results(
                operation, sim_h5_filenames, metric_op, metric_h5_filename)

        except OperationException as exception:
            HPCOperationService.LOGGER.error(exception)
            HPCOperationService._operation_error(operation)

        finally:
            if storage_interface.encryption_enabled():
                storage_interface.sync_folders(folder)
                storage_interface.set_project_inactive(operation.project)
Example #5
    def _operation_finished(operation, simulator_gid):
        op_ident = dao.get_operation_process_for_operation(operation.id)
        # TODO: Handle login
        job = Job(
            Transport(os.environ[HPCSchedulerClient.CSCS_LOGIN_TOKEN_ENV_KEY]),
            op_ident.job_id)
        sim_h5_filenames, metric_op, metric_h5_filename = \
            HPCSchedulerClient.stage_out_to_operation_folder(job.working_dir, operation, simulator_gid)
        operation.mark_complete(STATUS_FINISHED)
        dao.store_entity(operation)
        HPCSchedulerClient().update_db_with_results(
            operation, sim_h5_filenames, metric_op, metric_h5_filename)
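Taken together, check_operations_job and _operation_finished form a polling workflow: the first inspects the remote job status, the second stages results out once a job succeeds. A minimal driver sketch follows, assuming a plain daemon thread and an arbitrary interval (both are assumptions; the real service may schedule this differently).

    import threading

    POLL_INTERVAL_SEC = 10  # illustrative value, not taken from the original code

    def poll_hpc_operations(stop_event):
        # Re-check all HPC operations until asked to stop.
        while not stop_event.wait(POLL_INTERVAL_SEC):
            HPCOperationService.check_operations_job()

    stop_event = threading.Event()
    threading.Thread(target=poll_hpc_operations,
                     args=(stop_event,), daemon=True).start()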