def test_finish_job_failed(self):
    """!
    @brief A job in the FAILED state must be accepted by finish_job without
    raising any exception.
    """
    self.create_adapter()
    mocked_resource = mock.MagicMock()
    failed_job = self.create_job(mocked_resource)
    failed_job.set_status(eva.job.FAILED)
    self.adapter.finish_job(failed_job)
def sync(self, job):
    """!
    @brief Poll Grid Engine for job completion.

    Queries qacct for the job's accounting record. If the record is not
    available yet, the job is rescheduled for a later poll and False is
    returned. Once the record exists, the job's exit code, stdout and
    stderr are collected, job metrics are submitted to statsd, the job
    status is set to COMPLETE or FAILED, and temporary files are removed.

    @param job The job to synchronize against Grid Engine.
    @returns False if the job has not completed yet; None otherwise.
    @throws eva.exceptions.RetryException if SSH communication fails and
            the synchronization should be retried later.
    """
    # Create SSH connection
    try:
        self.ensure_ssh_connection(job)
    except SSH_RETRY_EXCEPTIONS as e:
        raise eva.exceptions.RetryException(e)

    # Poll for job completion; a non-zero exit code from the qacct command
    # means the accounting record is not available yet.
    check_command = self.create_qacct_command(job.pid)
    try:
        job.logger.debug('Running: %s', check_command)
        exit_code, stdout, stderr = self.execute_ssh_command(check_command)
    except SSH_RETRY_EXCEPTIONS as e:
        raise eva.exceptions.RetryException(e)
    if exit_code != EXIT_OK:
        job.logger.debug('Job %d has not completed yet.', job.pid)
        job.set_next_poll_time(QACCT_CHECK_INTERVAL_MSECS)
        return False
    job.exit_code = get_exit_code_from_qacct_output(stdout)

    # Submit job metrics
    stats = parse_qacct_metrics(stdout.splitlines())
    for metric, value in stats['metrics'].items():
        self.statsd.timing(metric, value, stats['tags'])

    # Retrieve stdout and stderr.
    # BUGFIX: the original caught the exception as `e` but discarded it,
    # hiding the root cause; include it in the message and chain it.
    try:
        with self.sftp_client.open(job.stdout_path, 'r') as f:
            job.stdout = eva.executor.strip_stdout_newlines(f.readlines())
        with self.sftp_client.open(job.stderr_path, 'r') as f:
            job.stderr = eva.executor.strip_stdout_newlines(f.readlines())
    except SSH_RETRY_EXCEPTIONS + (IOError,) as e:
        raise eva.exceptions.RetryException(
            'Unable to retrieve stdout and stderr from finished Grid Engine job: %s' % e
        ) from e

    # Set job exit status
    if job.exit_code == EXIT_OK:
        job.set_status(eva.job.COMPLETE)
    else:
        job.set_status(eva.job.FAILED)

    # Print stdout and stderr
    eva.executor.log_stdout_stderr(job, job.stdout, job.stderr)

    # Remove job script, stdout, and stderr caches. Cleanup failure is
    # non-fatal; log the underlying error instead of swallowing it.
    try:
        self.sftp_client.unlink(job.submit_script_path)
        self.sftp_client.unlink(job.stdout_path)
        self.sftp_client.unlink(job.stderr_path)
    except SSH_RETRY_EXCEPTIONS + (IOError,) as e:
        job.logger.warning('Could not remove script file, stdout and stderr (%s)', e)
def test_finish_job_ignore(self):
    """!
    @brief Test that finish_job accepts a FAILED job without raising.

    NOTE(review): the original docstring claimed this tests job creation,
    but the body sets the job status to FAILED and calls finish_job.
    """
    self.create_adapter()
    resource = mock.MagicMock()
    resource.url = 'file:///path/to/foo.bar'
    job = self.create_job(resource)
    job.set_status(eva.job.FAILED)
    self.adapter.finish_job(job)
def test_generate_resources(self):
    """!
    @brief The adapter must generate exactly one datainstance (and nothing
    else) for a completed job, rewriting the input URL to the output base.
    """
    self.create_adapter()
    input_resource = mock.MagicMock()
    input_resource.url = 'file:///path/to/foo.bar'
    completed_job = self.create_job(input_resource)
    completed_job.set_status(eva.job.COMPLETE)
    generated = self.adapter.default_resource_dictionary()
    self.adapter.generate_resources(completed_job, generated)
    for resource_type, expected_count in (('productinstance', 0), ('data', 0), ('datainstance', 1)):
        self.assertEqual(len(generated[resource_type]), expected_count)
    self.assertEqual(generated['datainstance'][0].args[0]['url'], 'http://bar/baz/foo.bar')
def test_finish_job_and_generate_resources(self):
    """!
    @brief Finishing a COMPLETE job and generating its resources must not
    raise, and must yield exactly one resource of each type.
    """
    self.create_adapter()
    mocked_resource = mock.MagicMock()
    with httmock.HTTMock(*eva.tests.schemas.SCHEMAS):
        finished_job = self.create_job(mocked_resource)
        finished_job.set_status(eva.job.COMPLETE)
        self.adapter.finish_job(finished_job)
        generated = self.generate_resources(finished_job)
        self.assertEqual(len(generated["productinstance"]), 1)
        self.assertEqual(len(generated["data"]), 1)
        self.assertEqual(len(generated["datainstance"]), 1)
        self.assertEqual(generated["datainstance"][0].args[0]["servicebackend"], self.adapter.output_service_backend)
def test_finish_job_and_generate_resources(self):
    """!
    @brief Finishing a COMPLETE job must not raise, and the generated data
    instances must have their deleted flag set to True.
    """
    self.create_adapter()
    mocked_resource = mock.MagicMock()
    self.adapter.api = mock.MagicMock()
    with httmock.HTTMock(*eva.tests.schemas.SCHEMAS):
        deletion_job = self.create_job(mocked_resource)
        deletion_job.set_status(eva.job.COMPLETE)
        self.adapter.finish_job(deletion_job)
    deletion_job.instance_list = [mock.MagicMock(), mock.MagicMock()]
    with httmock.HTTMock(*eva.tests.schemas.SCHEMAS):
        generated = self.generate_resources(deletion_job)
    for index in (0, 1):
        self.assertTrue(generated['datainstance'][index].deleted)
def test_finish_job_and_generate_resources(self):
    """!
    @brief Finishing a COMPLETE job must not raise, and the generated
    datainstance URL must be derived from the resource's reference time.
    """
    self.create_adapter()
    mocked_resource = mock.MagicMock()
    mocked_resource.url = 'file:///foo/bar/baz'
    reference_time = eva.coerce_to_utc(datetime.datetime(2016, 1, 1, 18, 15, 0))
    mocked_resource.data.productinstance.reference_time = reference_time
    with httmock.HTTMock(*eva.tests.schemas.SCHEMAS):
        finished_job = self.create_job(mocked_resource)
        finished_job.set_status(eva.job.COMPLETE)
        self.adapter.finish_job(finished_job)
        generated = self.generate_resources(finished_job)
    for resource_type, expected_count in (('productinstance', 1), ('data', 1), ('datainstance', 1)):
        self.assertEqual(len(generated[resource_type]), expected_count)
    self.assertEqual(generated['datainstance'][0].url, 'file:///out/20160101T181500Z')
def test_generate_resources(self):
    """!
    @brief Test that the md5sum found in the job's stdout is propagated to
    the generated datainstance hash fields.

    NOTE(review): the original docstring claimed this tests skipping of
    failed jobs, but the body marks the job COMPLETE and verifies hashes.
    """
    self.create_adapter()
    resource = mock.MagicMock()
    job = self.create_job(resource)
    job.set_status(eva.job.COMPLETE)
    self.adapter.finish_job(job)
    md5sum = '401b30e3b8b5d629635a5c613cdb7919'
    job.stdout = md5sum
    resources = self.generate_resources(job)
    self.assertEqual(resources['datainstance'][0].hash_type, str('md5'))
    self.assertEqual(resources['datainstance'][0].hash, md5sum)
    self.assertEqual(len(resources['productinstance']), 0)
    self.assertEqual(len(resources['data']), 0)
    self.assertEqual(len(resources['datainstance']), 1)
def test_finish_job_and_generate_resources(self):
    """!
    @brief Without a Productstatus API key, finishing a COMPLETE job must
    not raise, and only a single datainstance must be generated.
    """
    del self.env['EVA_PRODUCTSTATUS_API_KEY']
    self.create_adapter()
    mocked_resource = mock.MagicMock()
    mocked_resource.url = 'file:///foo/bar/baz'
    self.adapter.api = mock.MagicMock()
    finished_job = self.create_job(mocked_resource)
    finished_job.service_backend = 'foo'
    finished_job.set_status(eva.job.COMPLETE)
    self.adapter.finish_job(finished_job)
    generated = self.generate_resources(finished_job)
    for resource_type, expected_count in (('productinstance', 0), ('data', 0), ('datainstance', 1)):
        self.assertEqual(len(generated[resource_type]), expected_count)
    self.assertEqual(generated['datainstance'][0].url, 'file:///foo/baz')
def test_generate_resources(self):
    """!
    @brief Recognized output files from job stdout must be turned into one
    productinstance, two data resources, and two datainstances, with time
    periods taken from the .nc file and left empty for the .nml file.
    """
    self.create_adapter()
    for attribute in ('output_product', 'output_data_format', 'output_service_backend', 'nml_data_format'):
        setattr(self.adapter, attribute, mock.MagicMock())
    mocked_resource = mock.MagicMock()
    recognition_job = self.create_job(mocked_resource)
    recognition_job.stdout = [
        '/tmp/meteo20160606_00.nc time = "2016-06-06 12", "2016-06-06 15", "2016-06-06 18", "2016-06-06 21", "2016-06-07" ;',
        '/tmp/meteo20160606_00.nml',
    ]
    recognition_job.output_files = self.adapter.parse_file_recognition_output(recognition_job.stdout)
    reference_time = eva.coerce_to_utc(datetime.datetime(2016, 6, 6, 12))
    recognition_job.resource.data.productinstance.reference_time = reference_time
    recognition_job.set_status(eva.job.COMPLETE)
    self.adapter.finish_job(recognition_job)
    with httmock.HTTMock(*eva.tests.schemas.SCHEMAS):
        generated = self.generate_resources(recognition_job)
    self.assertEqual(generated['productinstance'][0].args[0]['product'], self.adapter.output_product)
    self.assertEqual(generated['datainstance'][0].url, 'file:///tmp/meteo20160606_00.nc')
    self.assertEqual(generated['datainstance'][0].format, self.adapter.output_data_format)
    self.assertEqual(generated['data'][0].args[0]['time_period_begin'], eva.coerce_to_utc(datetime.datetime(2016, 6, 6, 12)))
    self.assertEqual(generated['data'][0].args[0]['time_period_end'], eva.coerce_to_utc(datetime.datetime(2016, 6, 7)))
    self.assertEqual(generated['datainstance'][1].url, 'file:///tmp/meteo20160606_00.nml')
    self.assertEqual(generated['datainstance'][1].format, self.adapter.nml_data_format)
    self.assertEqual(generated['data'][1].args[0]['time_period_begin'], None)
    self.assertEqual(generated['data'][1].args[0]['time_period_end'], None)
    self.assertEqual(len(generated['productinstance']), 1)
    self.assertEqual(len(generated['data']), 2)
    self.assertEqual(len(generated['datainstance']), 2)
def sync(self, job):
    """!
    @brief Immediately mark the job as successfully completed, with a zero
    exit code and empty stdout/stderr.
    """
    job.set_status(eva.job.COMPLETE)
    job.exit_code = 0
    job.stdout = []
    job.stderr = []
def execute_async(self, job):
    """!
    @brief Pretend to start the job by walking it through the INITIALIZED
    and STARTED states without running anything.
    """
    job.logger.info("Faking job execution and setting exit code to zero.")
    for next_status in (eva.job.INITIALIZED, eva.job.STARTED):
        job.set_status(next_status)
def execute_async(self, job):
    """!
    @brief Execute a job on Grid Engine.

    Ensures an SSH connection, checks via qstat whether a task with this
    job's unique id is already queued or running (in which case submission
    is skipped and the job is marked STARTED), otherwise uploads a submit
    script over SFTP and submits it with qsub. On success the job is
    marked STARTED and scheduled for qacct polling.

    @param job The job to submit; its pid, stdout_path, stderr_path and
           submit_script_path attributes are populated here.
    @throws eva.exceptions.RetryException if SSH communication or the qsub
            submission fails and the operation should be retried.
    """
    skip_submit = False

    # Create SSH connection
    try:
        self.ensure_ssh_connection(job)
    except SSH_RETRY_EXCEPTIONS as e:
        raise eva.exceptions.RetryException(e)

    # Check whether a GridEngine task is already running for this job. If
    # it is, we skip submitting the job and jump right to the qacct polling.
    job.logger.info('Querying if job is already running.')
    job_id = create_job_unique_id(self.group_id, job.id)
    command = 'qstat -j %s' % job_id
    try:
        # qstat exits 0 when the named job exists in the queue.
        exit_code, stdout, stderr = self.execute_ssh_command(command)
        if exit_code == 0:
            job.pid = get_job_id_from_qstat_output(stdout)
            job.logger.warning('Job is already running with JOB_ID %d, will not submit a new job.', job.pid)
            job.set_status(eva.job.STARTED)
            skip_submit = True
        else:
            job.logger.info('Job is not running, continuing with submission.')
    except SSH_RETRY_EXCEPTIONS as e:
        raise eva.exceptions.RetryException(e)

    # Generate paths for the submit script and captured output.
    job.stdout_path = self.create_job_filename(job_id, 'stdout')
    job.stderr_path = self.create_job_filename(job_id, 'stderr')
    job.submit_script_path = self.create_job_filename(job_id, 'sh')

    # Skip submitting the job if it already exists
    if not skip_submit:
        # Create a submit script containing the job's command, uploaded
        # to the remote host via SFTP.
        try:
            with self.sftp_client.open(job.submit_script_path, 'w') as submit_script:
                script_content = job.command
                submit_script.write(script_content)
        except SSH_RETRY_EXCEPTIONS as e:
            raise eva.exceptions.RetryException(e)

        # Print the job script to the log
        eva.executor.log_job_script(job)

        # Submit the job using qsub; '-b n' treats the script as a shell
        # script, '-sync n' returns immediately without waiting for the job.
        command = ['qsub',
                   '-N', job_id,
                   '-b', 'n',
                   '-sync', 'n',
                   '-o', job.stdout_path,
                   '-e', job.stderr_path,
                   ]

        # Run jobs in a specified queue
        if self.env['EVA_GRIDENGINE_QUEUE']:
            command += ['-q', self.env['EVA_GRIDENGINE_QUEUE']]

        command += [job.submit_script_path]
        command = ' '.join(command)
        job.logger.info('Submitting job to GridEngine: %s', command)

        # Execute command asynchronously
        try:
            exit_code, stdout, stderr = self.execute_ssh_command(command)
            if exit_code != EXIT_OK:
                raise eva.exceptions.RetryException(
                    'Failed to submit the job to GridEngine, exit code %d' % exit_code
                )
            # The qsub confirmation line contains the numeric JOB_ID.
            job.pid = get_job_id_from_qsub_output(eva.executor.get_std_lines(stdout)[0])
            job.logger.info('Job has been submitted, JOB_ID = %d', job.pid)
            job.set_status(eva.job.STARTED)
            job.set_next_poll_time(QACCT_CHECK_INTERVAL_MSECS)
        except SSH_RETRY_EXCEPTIONS as e:
            raise eva.exceptions.RetryException(e)