def cleanup_job_execution(self, job_exe):
    """See :meth:`job.execution.job_exe_cleaner.JobExecutionCleaner.cleanup_job_execution`

    Tears down the directories used by a finished non-system job execution:
    the input (download) directory, one upload directory per output
    workspace, the work directory used for moving parsed source files (if it
    exists), and finally the execution's whole directory tree.
    """
    logger.info('Cleaning up a non-system job')

    exe_id = job_exe.id
    input_dir = get_job_exe_input_data_dir(exe_id)
    input_work_dir = get_job_exe_input_work_dir(exe_id)
    output_dir = get_job_exe_output_data_dir(exe_id)
    output_work_dir = get_job_exe_output_work_dir(exe_id)

    logger.info('Cleaning up download directory')
    ScaleFile.objects.cleanup_download_dir(input_dir, input_work_dir)

    logger.info('Cleaning up upload directories')
    output_workspace_ids = job_exe.job.get_job_data().get_output_workspace_ids()
    for workspace in Workspace.objects.filter(id__in=output_workspace_ids):
        logger.info('Cleaning up upload directory for workspace %s', workspace.name)
        ScaleFile.objects.cleanup_upload_dir(output_dir, output_work_dir, workspace)

    # The move work dir only exists when parsed source files were relocated
    move_dir = os.path.join(output_work_dir, 'move_source_file_in_workspace')
    if os.path.exists(move_dir):
        logger.info('Cleaning up work directory for moving parsed source files')
        ScaleFile.objects.cleanup_move_dir(move_dir)
        logger.info('Deleting %s', move_dir)
        os.rmdir(move_dir)

    delete_normal_job_exe_dir_tree(exe_id)
def cleanup_job_execution(self, job_exe):
    """See :meth:`job.execution.job_exe_cleaner.JobExecutionCleaner.cleanup_job_execution`

    Cleans up the download directory, each output workspace's upload
    directory, and (when present) the work directory used for moving parsed
    source files, then deletes the execution's directory tree.
    """
    logger.info('Cleaning up a non-system job')
    # Resolve all execution-scoped directories up front
    download_dir = get_job_exe_input_data_dir(job_exe.id)
    download_work_dir = get_job_exe_input_work_dir(job_exe.id)
    upload_dir = get_job_exe_output_data_dir(job_exe.id)
    upload_work_dir = get_job_exe_output_work_dir(job_exe.id)
    logger.info('Cleaning up download directory')
    ScaleFile.objects.cleanup_download_dir(download_dir, download_work_dir)
    logger.info('Cleaning up upload directories')
    # One cleanup pass per workspace the job wrote output to
    workspace_ids = job_exe.job.get_job_data().get_output_workspace_ids()
    for workspace in Workspace.objects.filter(id__in=workspace_ids):
        logger.info('Cleaning up upload directory for workspace %s', workspace.name)
        ScaleFile.objects.cleanup_upload_dir(upload_dir, upload_work_dir, workspace)
    # Work dir only exists if parsed source files were moved within a workspace
    move_work_dir = os.path.join(upload_work_dir, 'move_source_file_in_workspace')
    if os.path.exists(move_work_dir):
        logger.info('Cleaning up work directory for moving parsed source files')
        ScaleFile.objects.cleanup_move_dir(move_work_dir)
        logger.info('Deleting %s', move_work_dir)
        # rmdir: the directory is expected to be empty after cleanup_move_dir
        os.rmdir(move_work_dir)
    delete_normal_job_exe_dir_tree(job_exe.id)
def setup_job_dir(self, data_files, job_exe_id):
    '''Sets up the directory structure for a job execution and downloads the given files

    :param data_files: Dict with each file parameter name mapping to a bool indicating if the parameter accepts multiple
        files (True) and a relative directory path
    :type data_files: dict of str -> tuple(bool, str)
    :param job_exe_id: The job execution ID
    :type job_exe_id: int
    :returns: Dict with each file parameter name mapping to a list of absolute file paths of the written files
    :rtype: dict of str -> list of str
    '''
    download_dir = get_job_exe_input_data_dir(job_exe_id)
    download_work_dir = get_job_exe_input_work_dir(job_exe_id)
    upload_dir = get_job_exe_output_data_dir(job_exe_id)
    upload_work_dir = get_job_exe_output_work_dir(job_exe_id)

    # Download the job execution input files.
    # Fix: the docstring promises the retrieved-files dict as the return value,
    # but the original discarded it and returned None — capture and return it.
    retrieved_files = self.retrieve_input_data_files(download_dir, download_work_dir, data_files)

    # Set up upload directories for output workspace
    workspace_ids = self.get_output_workspace_ids()
    for workspace in Workspace.objects.filter(id__in=workspace_ids):
        ScaleFile.objects.setup_upload_dir(upload_dir, upload_work_dir, workspace)

    return retrieved_files
def setup_job_dir(self, data_files, job_exe_id):
    """Sets up the directory structure for a job execution and downloads the given files

    :param data_files: Dict with each file parameter name mapping to a bool indicating if the parameter accepts multiple
        files (True) and a relative directory path
    :type data_files: dict of str -> tuple(bool, str)
    :param job_exe_id: The job execution ID
    :type job_exe_id: int
    :returns: Dict with each file parameter name mapping to a list of absolute file paths of the written files
    :rtype: dict of str -> list of str
    """
    download_dir = get_job_exe_input_data_dir(job_exe_id)
    download_work_dir = get_job_exe_input_work_dir(job_exe_id)
    upload_dir = get_job_exe_output_data_dir(job_exe_id)
    upload_work_dir = get_job_exe_output_work_dir(job_exe_id)

    # Download the job execution input files.
    # Fix: capture and return the retrieved-files dict so the documented
    # :returns: contract actually holds (the original returned None).
    retrieved_files = self.retrieve_input_data_files(download_dir, download_work_dir, data_files)

    # Set up upload directories for output workspace
    workspace_ids = self.get_output_workspace_ids()
    for workspace in Workspace.objects.filter(id__in=workspace_ids):
        ScaleFile.objects.setup_upload_dir(upload_dir, upload_work_dir, workspace)

    # If the upload dir did not get created (e.g. no output files), make sure it gets created for results manifests
    if not os.path.exists(upload_dir):
        logger.info("Creating %s", upload_dir)
        # Fix: 0755 is Python-2-only octal syntax; 0o755 is valid on 2.6+ and 3.x
        os.makedirs(upload_dir, mode=0o755)

    return retrieved_files
def test_file_in_command(self, mock_retrieve_call, mock_os_mkdir, mock_get_one_file, mock_setup_upload):
    """Tests that ${file1} in the command is populated with the downloaded input file path
    and that the upload directory is set up for the output workspace."""
    job_exe_id = 1
    job_input_dir = file_system.get_job_exe_input_data_dir(job_exe_id)
    input_file_path = os.path.join(job_input_dir, 'file1', 'foo.txt')

    def new_retrieve(arg1, arg2, arg3):
        # Stand-in for the retrieve call: maps the output param to the downloaded file
        return {'file1_out': [input_file_path]}

    mock_retrieve_call.side_effect = new_retrieve
    # Fix: `lambda (arg1): ...` was Python-2-only tuple-parameter syntax (removed
    # by PEP 3113); a plain single-argument lambda is equivalent and portable.
    mock_get_one_file.side_effect = lambda arg1: input_file_path

    job_interface_dict, job_data_dict, job_environment_dict = self._get_simple_interface_data_env()
    job_interface_dict['command_arguments'] = '${file1}'
    job_interface_dict['input_data'] = [{'name': 'file1', 'type': 'file', 'required': True}]
    job_data_dict['input_data'].append({'name': 'file1', 'file_id': self.file.id})
    job_data_dict['output_data'].append({'name': 'file1_out', 'workspace_id': self.workspace.id})

    job_interface = JobInterface(job_interface_dict)
    job_data = JobData(job_data_dict)
    job_environment = JobEnvironment(job_environment_dict)

    job_interface.perform_pre_steps(job_data, job_environment, job_exe_id)
    job_command_arguments = job_interface.fully_populate_command_argument(job_data, job_environment, job_exe_id)
    self.assertEqual(job_command_arguments, input_file_path, 'expected a different command from pre_steps')
    mock_setup_upload.assert_called_once_with(file_system.get_job_exe_output_data_dir(job_exe_id),
                                              file_system.get_job_exe_output_work_dir(job_exe_id), self.workspace)
def cleanup_job_execution(self, job_exe):
    '''See :meth:`job.execution.job_exe_cleaner.JobExecutionCleaner.cleanup_job_execution`

    Cleans up the download directory and every output workspace's upload
    directory for the finished execution, then saves its metrics.
    '''
    logger.info('Cleaning up a non-system job')

    exe_id = job_exe.id
    input_dir = get_job_exe_input_data_dir(exe_id)
    input_work_dir = get_job_exe_input_work_dir(exe_id)
    output_dir = get_job_exe_output_data_dir(exe_id)
    output_work_dir = get_job_exe_output_work_dir(exe_id)

    logger.info('Cleaning up download directory')
    ScaleFile.objects.cleanup_download_dir(input_dir, input_work_dir)

    logger.info('Cleaning up upload directories')
    output_workspace_ids = job_exe.job.get_job_data().get_output_workspace_ids()
    for workspace in Workspace.objects.filter(id__in=output_workspace_ids):
        logger.info('Cleaning up upload directory for workspace %s', workspace.name)
        ScaleFile.objects.cleanup_upload_dir(output_dir, output_work_dir, workspace)

    save_job_exe_metrics(job_exe)
def test_file_in_command(self, mock_retrieve_call, mock_os_mkdir, mock_get_one_file, mock_setup_upload):
    """Tests that ${file1} in the command is populated with the downloaded input file path
    and that the upload directory is set up for the output workspace."""
    job_exe_id = 1
    job_input_dir = file_system.get_job_exe_input_data_dir(job_exe_id)
    input_file_path = os.path.join(job_input_dir, 'file1', 'foo.txt')

    def new_retrieve(arg1, arg2, arg3):
        # Stand-in for the retrieve call: maps the output param to the downloaded file
        return {'file1_out': [input_file_path]}

    mock_retrieve_call.side_effect = new_retrieve
    # Fix: `lambda(arg1): ...` was Python-2-only tuple-parameter syntax (removed
    # by PEP 3113); a plain single-argument lambda is equivalent and portable.
    mock_get_one_file.side_effect = lambda arg1: input_file_path

    job_interface_dict, job_data_dict, job_environment_dict = self._get_simple_interface_data_env()
    job_interface_dict['command_arguments'] = '${file1}'
    job_interface_dict['input_data'] = [{'name': 'file1', 'type': 'file', 'required': True}]
    job_data_dict['input_data'].append({'name': 'file1', 'file_id': self.file.id})
    job_data_dict['output_data'].append({'name': 'file1_out', 'workspace_id': self.workspace.id})

    job_interface = JobInterface(job_interface_dict)
    job_data = JobData(job_data_dict)
    job_environment = JobEnvironment(job_environment_dict)

    job_interface.perform_pre_steps(job_data, job_environment, job_exe_id)
    job_command_arguments = job_interface.fully_populate_command_argument(job_data, job_environment, job_exe_id)
    self.assertEqual(job_command_arguments, input_file_path, 'expected a different command from pre_steps')
    mock_setup_upload.assert_called_once_with(file_system.get_job_exe_output_data_dir(job_exe_id),
                                              file_system.get_job_exe_output_work_dir(job_exe_id), self.workspace)
def setup_job_dir(self, data_files, job_exe_id):
    '''Sets up the directory structure for a job execution and downloads the given files

    :param data_files: Dict with each file parameter name mapping to a bool indicating if the parameter accepts multiple
        files (True) and a relative directory path
    :type data_files: dict of str -> tuple(bool, str)
    :param job_exe_id: The job execution ID
    :type job_exe_id: int
    :returns: Dict with each file parameter name mapping to a list of absolute file paths of the written files
    :rtype: dict of str -> list of str
    '''
    # NOTE(review): despite the :returns: above, this version has no return
    # statement and yields None — the retrieve call's result is discarded.
    # Confirm against callers whether the return value is actually needed.
    download_dir = get_job_exe_input_data_dir(job_exe_id)
    download_work_dir = get_job_exe_input_work_dir(job_exe_id)
    upload_dir = get_job_exe_output_data_dir(job_exe_id)
    upload_work_dir = get_job_exe_output_work_dir(job_exe_id)

    # Download the job execution input files
    self.retrieve_input_data_files(download_dir, download_work_dir, data_files)

    # Set up upload directories for output workspace
    workspace_ids = self.get_output_workspace_ids()
    for workspace in Workspace.objects.filter(id__in=workspace_ids):
        ScaleFile.objects.setup_upload_dir(upload_dir, upload_work_dir, workspace)
def store_output_data_files(self, data_files, job_exe):
    '''Stores the given data output files

    :param data_files: Dict with each file parameter name mapping to a tuple of absolute local file path and media type
        (media type is optionally None) for a single file parameter and a list of tuples for a multiple file parameter
    :type data_files: dict of str -> tuple(str, str) or list of tuple(str, str)
    :param job_exe: The job execution model (with related job and job_type fields) that is storing the output data files
    :type job_exe: :class:`job.models.JobExecution`
    :returns: The job results
    :rtype: :class:`job.configuration.results.job_results.JobResults`
    '''
    upload_dir = get_job_exe_output_data_dir(job_exe.id)
    work_dir = get_job_exe_output_work_dir(job_exe.id)

    # Organize the data files
    workspace_files = {}  # Workspace ID -> list of (absolute local file path, media type)
    params_by_file_path = {}  # Absolute local file path -> output parameter name
    for name in data_files:
        file_output = self.data_outputs_by_name[name]
        workspace_id = file_output[u'workspace_id']
        # Group files destined for the same workspace into one list
        if workspace_id in workspace_files:
            workspace_file_list = workspace_files[workspace_id]
        else:
            workspace_file_list = []
            workspace_files[workspace_id] = workspace_file_list
        data_file_entry = data_files[name]
        # A list entry means a multiple-file parameter; a tuple means a single file
        if isinstance(data_file_entry, list):
            for file_tuple in data_file_entry:
                file_path = os.path.normpath(file_tuple[0])
                if not os.path.isfile(file_path):
                    raise Exception('%s is not a valid file' % file_path)
                params_by_file_path[file_path] = name
                # Adjust file path to be relative to upload_dir; a 3rd tuple
                # element (when present) is carried through unchanged
                if len(file_tuple) == 2:
                    new_tuple = (os.path.relpath(file_path, upload_dir), file_tuple[1])
                else:
                    new_tuple = (os.path.relpath(file_path, upload_dir), file_tuple[1], file_tuple[2])
                workspace_file_list.append(new_tuple)
        else:
            file_path = os.path.normpath(data_file_entry[0])
            if not os.path.isfile(file_path):
                raise Exception('%s is not a valid file' % file_path)
            params_by_file_path[file_path] = name
            # Adjust file path to be relative to upload_dir
            if len(data_file_entry) == 2:
                new_tuple = (os.path.relpath(file_path, upload_dir), data_file_entry[1])
            else:
                new_tuple = (os.path.relpath(file_path, upload_dir), data_file_entry[1], data_file_entry[2])
            workspace_file_list.append(new_tuple)

    # Hand the grouped files to the configured data file store backend
    data_file_store = DATA_FILE_STORE[u'DATA_FILE_STORE']
    if not data_file_store:
        raise Exception(u'No data file store found')
    stored_files = data_file_store.store_files(upload_dir, work_dir, workspace_files, self.get_input_file_ids(), job_exe)

    # Organize results: map each stored file ID back to its output parameter
    param_file_ids = {}  # Output parameter name -> file ID or list of file IDs
    for file_path in stored_files:
        file_id = stored_files[file_path]
        name = params_by_file_path[file_path]
        if isinstance(data_files[name], list):
            if name in param_file_ids:
                file_id_list = param_file_ids[name]
            else:
                file_id_list = []
                param_file_ids[name] = file_id_list
            file_id_list.append(file_id)
        else:
            param_file_ids[name] = file_id

    # Create job results
    results = JobResults()
    for name in param_file_ids:
        param_entry = param_file_ids[name]
        if isinstance(param_entry, list):
            results.add_file_list_parameter(name, param_entry)
        else:
            results.add_file_parameter(name, param_entry)
    return results
def store_output_data_files(self, data_files, job_exe):
    """Stores the given data output files

    :param data_files: Dict with each file parameter name mapping to a tuple of absolute local file path and media type
        (media type is optionally None) for a single file parameter and a list of tuples for a multiple file parameter
    :type data_files: dict of str -> tuple(str, str) or list of tuple(str, str)
    :param job_exe: The job execution model (with related job and job_type fields) that is storing the output data files
    :type job_exe: :class:`job.models.JobExecution`
    :returns: The job results
    :rtype: :class:`job.configuration.results.job_results.JobResults`
    """
    upload_dir = get_job_exe_output_data_dir(job_exe.id)
    work_dir = get_job_exe_output_work_dir(job_exe.id)

    # Organize the data files
    workspace_files = {}  # Workspace ID -> list of (absolute local file path, media type)
    params_by_file_path = {}  # Absolute local file path -> output parameter name
    for name in data_files:
        file_output = self.data_outputs_by_name[name]
        workspace_id = file_output[u"workspace_id"]
        # Group files destined for the same workspace into one list
        if workspace_id in workspace_files:
            workspace_file_list = workspace_files[workspace_id]
        else:
            workspace_file_list = []
            workspace_files[workspace_id] = workspace_file_list
        data_file_entry = data_files[name]
        # A list entry means a multiple-file parameter; a tuple means a single file
        if isinstance(data_file_entry, list):
            for file_tuple in data_file_entry:
                file_path = os.path.normpath(file_tuple[0])
                if not os.path.isfile(file_path):
                    raise Exception("%s is not a valid file" % file_path)
                params_by_file_path[file_path] = name
                # Adjust file path to be relative to upload_dir; a 3rd tuple
                # element (when present) is carried through unchanged
                if len(file_tuple) == 2:
                    new_tuple = (os.path.relpath(file_path, upload_dir), file_tuple[1])
                else:
                    new_tuple = (os.path.relpath(file_path, upload_dir), file_tuple[1], file_tuple[2])
                workspace_file_list.append(new_tuple)
        else:
            file_path = os.path.normpath(data_file_entry[0])
            if not os.path.isfile(file_path):
                raise Exception("%s is not a valid file" % file_path)
            params_by_file_path[file_path] = name
            # Adjust file path to be relative to upload_dir
            if len(data_file_entry) == 2:
                new_tuple = (os.path.relpath(file_path, upload_dir), data_file_entry[1])
            else:
                new_tuple = (os.path.relpath(file_path, upload_dir), data_file_entry[1], data_file_entry[2])
            workspace_file_list.append(new_tuple)

    # Hand the grouped files to the configured data file store backend
    data_file_store = DATA_FILE_STORE[u"DATA_FILE_STORE"]
    if not data_file_store:
        raise Exception(u"No data file store found")
    stored_files = data_file_store.store_files(
        upload_dir, work_dir, workspace_files, self.get_input_file_ids(), job_exe
    )

    # Organize results: map each stored file ID back to its output parameter
    param_file_ids = {}  # Output parameter name -> file ID or list of file IDs
    for file_path in stored_files:
        file_id = stored_files[file_path]
        name = params_by_file_path[file_path]
        if isinstance(data_files[name], list):
            if name in param_file_ids:
                file_id_list = param_file_ids[name]
            else:
                file_id_list = []
                param_file_ids[name] = file_id_list
            file_id_list.append(file_id)
        else:
            param_file_ids[name] = file_id

    # Create job results
    results = JobResults()
    for name in param_file_ids:
        param_entry = param_file_ids[name]
        if isinstance(param_entry, list):
            results.add_file_list_parameter(name, param_entry)
        else:
            results.add_file_parameter(name, param_entry)
    return results
def test_files_in_command(self, mock_retrieve_call, mock_os_mkdir, mock_setup_upload):
    """Tests that ${files1} in the command is populated with the multi-file input directory
    and that the upload directory is set up for the output workspace."""
    job_exe_id = 1
    job_input_dir = get_job_exe_input_data_dir(job_exe_id)

    def new_retrieve(arg1, arg2, arg3):
        # Stand-in for the retrieve call: maps the output param to the downloaded files
        return {u'files1_out': [u'/test/file1/foo.txt', u'/test/file1/bar.txt']}

    mock_retrieve_call.side_effect = new_retrieve

    job_interface_dict, job_data_dict, job_environment_dict = self._get_simple_interface_data_env()
    job_interface_dict[u'command_arguments'] = u'${files1}'
    job_interface_dict[u'input_data'] = [{u'name': u'files1', u'type': u'files', 'required': True}]
    job_data_dict[u'input_data'].append({u'name': u'files1', u'file_ids': [1, 2, 3]})
    job_data_dict[u'output_data'].append({u'name': u'files1_out', u'workspace_id': self.workspace.id})

    job_interface = JobInterface(job_interface_dict)
    job_data = JobData(job_data_dict)
    job_environment = JobEnvironment(job_environment_dict)

    # Fix: pass job_exe_id (was a hard-coded literal 1, same value) and drop the
    # unused local `job_work_dir = "/test"` the original defined.
    job_interface.perform_pre_steps(job_data, job_environment, job_exe_id)
    job_command_arguments = job_interface.fully_populate_command_argument(job_data, job_environment, job_exe_id)
    expected_command_arguments = os.path.join(job_input_dir, 'files1')
    self.assertEqual(job_command_arguments, expected_command_arguments, u'expected a different command from pre_steps')
    mock_setup_upload.assert_called_once_with(get_job_exe_output_data_dir(job_exe_id),
                                              get_job_exe_output_work_dir(job_exe_id), self.workspace)
def perform_post_steps(self, job_exe, job_data, stdoutAndStderr):
    '''Stores the files and deletes any working directories

    :param job_exe: The job execution model with related job and job_type fields
    :type job_exe: :class:`job.models.JobExecution`
    :param job_data: The job data
    :type job_data: :class:`job.configuration.data.job_data.JobData`
    :param stdoutAndStderr: the standard out from the job execution
    :type stdoutAndStderr: str
    :return: A tuple of the job results and the results manifest generated by the job execution
    :rtype: (:class:`job.configuration.results.job_results.JobResults`,
        :class:`job.configuration.results.results_manifest.results_manifest.ResultsManifest`)
    '''
    # Load the results manifest the job wrote into its output dir (if any)
    manifest_data = {}
    job_output_dir = get_job_exe_output_data_dir(job_exe.id)
    path_to_manifest_file = os.path.join(job_output_dir, 'results_manifest.json')
    if os.path.exists(path_to_manifest_file):
        logger.info('Opening results manifest...')
        with open(path_to_manifest_file, 'r') as manifest_file:
            manifest_data = json.loads(manifest_file.read())
            logger.info('Results manifest:')
            logger.info(manifest_data)
    else:
        logger.info('No results manifest found')
    results_manifest = ResultsManifest(manifest_data)
    # Artifacts announced on stdout are merged into the manifest before validation
    stdout_files = self._get_artifacts_from_stdout(stdoutAndStderr)
    results_manifest.add_files(stdout_files)
    results_manifest.validate(self._output_file_manifest_dict)

    # Build the param-name -> file tuple(s) mapping expected by store_output_data_files
    files_to_store = {}
    for manifest_file_entry in results_manifest.get_files():
        param_name = manifest_file_entry['name']
        media_type = None
        output_data_item = self._get_output_data_item_by_name(param_name)
        if output_data_item:
            media_type = output_data_item.get('media_type')
        # 'file' = single-file parameter, 'files' = multiple-file parameter;
        # geo_metadata (when present) rides along as a third tuple element
        if 'file' in manifest_file_entry:
            file_entry = manifest_file_entry['file']
            if 'geo_metadata' in file_entry:
                files_to_store[param_name] = (file_entry['path'], media_type, file_entry['geo_metadata'])
            else:
                files_to_store[param_name] = (file_entry['path'], media_type)
        elif 'files' in manifest_file_entry:
            file_tuples = []
            for file_entry in manifest_file_entry['files']:
                if 'geo_metadata' in file_entry:
                    file_tuples.append((file_entry['path'], media_type, file_entry['geo_metadata']))
                else:
                    file_tuples.append((file_entry['path'], media_type))
            files_to_store[param_name] = file_tuples

    job_data_parse_results = {}  # parse results formatted for job_data
    for parse_result in results_manifest.get_parse_results():
        filename = parse_result['filename']
        # Each source file may appear at most once in the parse results
        assert filename not in job_data_parse_results
        geo_metadata = parse_result.get('geo_metadata', {})
        geo_json = geo_metadata.get('geo_json', None)
        data_started = geo_metadata.get('data_started', None)
        data_ended = geo_metadata.get('data_ended', None)
        data_types = parse_result.get('data_types', [])
        new_workspace_path = parse_result.get('new_workspace_path', None)
        work_dir = None
        # A new workspace path triggers a move; the move uses a dedicated work dir
        if new_workspace_path:
            new_workspace_path = os.path.join(new_workspace_path, filename)
            work_dir = os.path.join(get_job_exe_output_work_dir(job_exe.id), 'move_source_file_in_workspace')
        job_data_parse_results[filename] = (geo_json, data_started, data_ended, data_types, new_workspace_path, work_dir)
    job_data.save_parse_results(job_data_parse_results)
    return (job_data.store_output_data_files(files_to_store, job_exe), results_manifest)
def perform_post_steps(self, job_exe, job_data, stdoutAndStderr):
    '''Stores the files and deletes any working directories

    :param job_exe: The job execution model with related job and job_type fields
    :type job_exe: :class:`job.models.JobExecution`
    :param job_data: The job data
    :type job_data: :class:`job.configuration.data.job_data.JobData`
    :param stdoutAndStderr: the standard out from the job execution
    :type stdoutAndStderr: str
    :return: A tuple of the job results and the results manifest generated by the job execution
    :rtype: (:class:`job.configuration.results.job_results.JobResults`,
        :class:`job.configuration.results.results_manifest.results_manifest.ResultsManifest`)
    '''
    # Load the results manifest the job wrote into its output dir (if any)
    manifest_data = {}
    job_output_dir = get_job_exe_output_data_dir(job_exe.id)
    path_to_manifest_file = os.path.join(job_output_dir, 'results_manifest.json')
    if os.path.exists(path_to_manifest_file):
        logger.info('Opening results manifest...')
        with open(path_to_manifest_file, 'r') as manifest_file:
            manifest_data = json.loads(manifest_file.read())
            logger.info('Results manifest:')
            logger.info(manifest_data)
    else:
        logger.info('No results manifest found')
    results_manifest = ResultsManifest(manifest_data)
    # Artifacts announced on stdout are merged into the manifest before validation
    stdout_files = self._get_artifacts_from_stdout(stdoutAndStderr)
    results_manifest.add_files(stdout_files)
    results_manifest.validate(self._output_file_manifest_dict)

    # Build the param-name -> file tuple(s) mapping expected by store_output_data_files
    files_to_store = {}
    for manifest_file_entry in results_manifest.get_files():
        param_name = manifest_file_entry['name']
        media_type = None
        output_data_item = self._get_output_data_item_by_name(param_name)
        if output_data_item:
            media_type = output_data_item.get('media_type')
        # 'file' = single-file parameter, 'files' = multiple-file parameter;
        # geo_metadata (when present) rides along as a third tuple element
        if 'file' in manifest_file_entry:
            file_entry = manifest_file_entry['file']
            if 'geo_metadata' in file_entry:
                files_to_store[param_name] = (file_entry['path'], media_type, file_entry['geo_metadata'])
            else:
                files_to_store[param_name] = (file_entry['path'], media_type)
        elif 'files' in manifest_file_entry:
            file_tuples = []
            for file_entry in manifest_file_entry['files']:
                if 'geo_metadata' in file_entry:
                    file_tuples.append((file_entry['path'], media_type, file_entry['geo_metadata']))
                else:
                    file_tuples.append((file_entry['path'], media_type))
            files_to_store[param_name] = file_tuples

    job_data_parse_results = {}  # parse results formatted for job_data
    for parse_result in results_manifest.get_parse_results():
        filename = parse_result['filename']
        # Each source file may appear at most once in the parse results
        assert filename not in job_data_parse_results
        geo_metadata = parse_result.get('geo_metadata', {})
        geo_json = geo_metadata.get('geo_json', None)
        data_started = geo_metadata.get('data_started', None)
        data_ended = geo_metadata.get('data_ended', None)
        data_types = parse_result.get('data_types', [])
        new_workspace_path = parse_result.get('new_workspace_path', None)
        work_dir = None
        # A new workspace path triggers a move; the move uses a dedicated work dir
        if new_workspace_path:
            new_workspace_path = os.path.join(new_workspace_path, filename)
            work_dir = os.path.join(get_job_exe_output_work_dir(job_exe.id), 'move_source_file_in_workspace')
        job_data_parse_results[filename] = (geo_json, data_started, data_ended, data_types, new_workspace_path, work_dir)
    job_data.save_parse_results(job_data_parse_results)
    return (job_data.store_output_data_files(files_to_store, job_exe), results_manifest)