def test_relative_path_list(tmp_path):
    file_list = tmp_path / "list.txt"
    CharSink.to_file(file_list).write("\n".join(["fred/bob.txt", "foo.txt"]))
    params = Parameters.from_mapping({"file_list": str(file_list)})
    assert list(
        params.path_list_from_file(
            "file_list", resolve_relative_to=Path("/hello/world"))) == [
                Path("/hello/world/fred/bob.txt"),
                Path("/hello/world/foo.txt")
            ]

def test_relative_path_map(tmp_path):
    file_map = tmp_path / "map.txt"
    CharSink.to_file(file_map).write("\n".join(
        ["one\tfred/bob.txt", "two\tfoo.txt"]))
    params = Parameters.from_mapping({"file_map": str(file_map)})
    assert dict(
        params.path_map_from_file(
            "file_map", resolve_relative_to=Path("/hello/world"))) == {
                "one": Path("/hello/world/fred/bob.txt"),
                "two": Path("/hello/world/foo.txt")
            }

def test_to_file_open(self):
    tmp_dir = Path(tempfile.mkdtemp())
    file_path = tmp_dir / "test.txt"
    with CharSink.to_file(file_path).open() as out:
        out.write("hello\n\nworld\n")
    source = CharSource.from_file(file_path)
    self.assertEqual("hello\n\nworld\n", source.read_all())
    shutil.rmtree(str(tmp_dir))

def test_to_file_write_string_arg(self):
    tmp_dir = Path(tempfile.mkdtemp())
    file_path = tmp_dir / "test.txt"
    sink = CharSink.to_file(str(file_path))
    sink.write("hello\n\nworld\n")
    source = CharSource.from_file(str(file_path))
    self.assertEqual("hello\n\nworld\n", source.read_all())
    shutil.rmtree(str(tmp_dir))

def test_writing_to_yaml(self):
    params = Parameters.from_mapping({
        "hello": "world",
        "moo": {
            "nested_dict": {
                "lalala": "fooo",
                "meep": 2,
                "list": [1, 2, 3]
            }
        },
        "some_path": Path("/hello/world"),
        "path_list": [Path("/meep/lalala"), Path("/moo/cow")],
    })
    string_buffer = CharSink.to_string()
    YAMLParametersWriter().write(params, string_buffer)
    self.assertEqual(TestParameters.WRITING_REFERENCE,
                     string_buffer.last_string_written)

    with self.assertRaisesRegex(
            RuntimeError,
            "bytes and bytearrays are not legal parameter values"):
        YAMLParametersWriter().write(
            Parameters.from_mapping({"illegal": b"bytes"}),
            CharSink.to_nowhere())

    with self.assertRaisesRegex(
            RuntimeError,
            "bytes and bytearrays are not legal parameter values"):
        YAMLParametersWriter().write(
            Parameters.from_mapping({"illegal": bytearray()}),
            CharSink.to_nowhere())

    with self.assertRaisesRegex(
            RuntimeError,
            "Don't know how to serialize out .* as a parameter value"):
        YAMLParametersWriter().write(
            Parameters.from_mapping({"illegal": Parameters}),
            CharSink.to_nowhere())

def test_read_write_doc_id_to_file_map(self):
    mapping = ImmutableDict.of([("foo", Path("/home/foo")),
                                ("bar", Path("/home/bar"))])
    string_sink = CharSink.to_string()
    write_doc_id_to_file_map(mapping, string_sink)
    # note the reordering because it alphabetizes the docids
    self.assertEqual("bar\t/home/bar\nfoo\t/home/foo\n",
                     string_sink.last_string_written)
    reloaded_map = read_doc_id_to_file_map(
        CharSource.from_string(string_sink.last_string_written))
    self.assertEqual(mapping, reloaded_map)

def _split_into_even_slices(input_source: KeyValueSource[str, bytes],
                            params: Parameters):
    output_directory = params.creatable_directory("output_dir")
    slices = params.positive_integer("num_slices")
    random_seed = params.optional_positive_integer("random_seed")
    slice_paths = [
        output_directory / "{!s}.zip".format(i) for i in range(slices)
    ]
    CharSink.to_file(output_directory / "_slices.txt").write("\n".join(
        str(x) for x in slice_paths))
    output_sinks = [
        KeyValueSink.zip_bytes_sink(slice_path) for slice_path in slice_paths
    ]
    # this is the magic incantation for handling variable-length lists of context managers
    with ExitStack() as exit_stack:
        for output_sink in output_sinks:
            exit_stack.enter_context(output_sink)
        # guarantee deterministic iteration order
        input_keys = sorted(list(input_source.keys()))  # type: ignore
        if random_seed:
            random.seed(random_seed)
            random.shuffle(input_keys)
        for (i, k) in enumerate(input_keys):
            output_sinks[i % slices].put(k, input_source[k])

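# Hedged, self-contained sketch (not from the original source) of the ExitStack
# pattern used above: entering a variable-length list of context managers so that
# every one of them is closed when the block exits, even on error. Names and the
# temporary-file setup are illustrative only.
from contextlib import ExitStack
from pathlib import Path
from typing import Sequence
import tempfile


def write_to_many_files(texts: Sequence[str]) -> None:
    out_dir = Path(tempfile.mkdtemp())
    # One file handle per input string; the count is only known at runtime.
    handles = [(out_dir / f"{i}.txt").open("w") for i in range(len(texts))]
    with ExitStack() as exit_stack:
        for handle in handles:
            exit_stack.enter_context(handle)
        for handle, text in zip(handles, texts):
            handle.write(text)
    # All handles are closed here, however many there were.
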
def write_shell_script_to(
    self,
    entry_point_name: str,
    parameters: Union[Path, Parameters],
    *,
    working_directory: Path,
    script_path: Path,
    params_path: Optional[Path],
    stdout_file: Optional[Path] = None,
    ckpt_path: Optional[Path] = None,
    override_conda_config: Optional[CondaConfiguration] = None,
) -> None:
    if isinstance(parameters, Path):
        if params_path:
            raise RuntimeError(
                "Cannot specify params_path and provide a path for parameters"
            )
        params_path = parameters
    elif isinstance(parameters, Parameters):
        if not params_path:
            raise RuntimeError(
                "Params path must be specified when providing a parameters object"
            )
        YAMLParametersWriter().write(parameters, CharSink.to_file(params_path))
    else:
        raise RuntimeError(
            f"Parameters must be either Parameters or path to a param file, "
            f"but got {parameters}")

    if not stdout_file:
        stdout_file = working_directory / "___stdout.log"

    script_path.write_text(
        self.generate_shell_script(
            entry_point_name=entry_point_name,
            param_file=params_path,
            stdout_file=stdout_file,
            working_directory=working_directory,
            ckpt_path=ckpt_path,
            override_conda_config=override_conda_config,
        ),
        encoding="utf-8",
    )
    # Mark the generated script as executable.
    script_path.chmod(script_path.stat().st_mode | stat.S_IEXEC)

def test_null_sink(self):
    sink = CharSink.to_nowhere()
    sink.write("foo")
    with sink.open() as out:
        out.write("meep")

def test_string_sink(self):
    string_sink = CharSink.to_string()
    string_sink.write("hello world")
    self.assertEqual("hello world", string_sink.last_string_written)

def _run_python_in_container(
    self,
    job_name: Locator,
    python_module_or_path_on_docker: Union[str, Path],
    python_args_or_parameters: Union[Parameters, str],
    container: Container,
    *,
    depends_on,
    docker_args: str = "",
    python_executable_path_in_docker: Path = PYTHON_EXECUTABLE_DOCKER_PATH,
    input_files: Union[Iterable[Union[Path, str]], Path, str] = immutableset(),
    output_files: Union[Iterable[Union[Path, str]], Path, str] = immutableset(),
    docker_mount_root: Path = DOCKER_MOUNT_ROOT,
    resource_request: Optional[ResourceRequest] = None,
    category: Optional[str] = None,
    pre_docker_bash: Union[Iterable[str], str] = "",
    post_docker_bash: Union[Iterable[str], str] = "",
    job_is_stageable: bool = False,
    job_bypass_staging: bool = False,
    times_to_retry_job: int = 0,
    job_profiles: Iterable[PegasusProfile] = immutableset(),
) -> DependencyNode:
    """
    Automatically converts a python job into a container request.
    """
    # Ensure the input and output files are iterables of Path or str
    if isinstance(input_files, (Path, str)):
        input_files = immutableset([input_files])
    if isinstance(output_files, (Path, str)):
        output_files = immutableset([output_files])

    # A set to keep track of all the file names that will be created or copied into
    # the mounted directory. We use this to raise errors if a duplicate name would appear.
    params_file_name = "____params.params"
    params_file = None
    file_names = {params_file_name}

    job_dir = self.directory_for(job_name)

    # Define the root mount point for the scratch mount
    scratch_root = DOCKERMOUNT_SCRATCH_PATH_ROOT / self.name / str(job_name)

    # Docker args required by the wrapper itself, plus any user-provided args
    modified_docker_args = (
        f"--rm -v {scratch_root}:{docker_mount_root} " + docker_args)

    # Build path mappings for docker
    mapping_input_files = []
    for i_file in input_files:
        if i_file.name in file_names:
            raise RuntimeError(
                f"Unable to create container job {job_name} with multiple files with name {i_file.name}"
            )
        file_names.add(i_file.name)
        mapping_input_files.append((
            str(i_file.absolute()),
            PegasusContainerFile(
                name=i_file.name,
                nas=i_file,
                scratch=scratch_root / i_file.name,
                docker=docker_mount_root / i_file.name,
            ),
        ))
    converted_input_files = immutabledict(mapping_input_files)

    mapping_output_files = []
    for o_file in output_files:
        if o_file.name in file_names:
            raise RuntimeError(
                f"Unable to create container job {job_name} with multiple files with name {o_file.name}"
            )
        file_names.add(o_file.name)
        mapping_output_files.append((
            str(o_file.absolute()),
            PegasusContainerFile(
                name=o_file.name,
                nas=o_file,
                scratch=scratch_root / o_file.name,
                docker=docker_mount_root / o_file.name,
            ),
        ))
    converted_output_files = immutabledict(mapping_output_files)

    # Process the Python parameters or args for any file paths which need to change
    if isinstance(python_args_or_parameters, Parameters):
        mutable_params = dict(python_args_or_parameters.as_mapping())
        for key, value in python_args_or_parameters.as_mapping().items():
            if isinstance(value, Path):
                if str(value.absolute()) in converted_input_files:
                    mutable_params[key] = str(converted_input_files[str(
                        value.absolute())].docker.absolute())
                elif str(value.absolute()) in converted_output_files:
                    mutable_params[key] = str(converted_output_files[str(
                        value.absolute())].docker.absolute())
        modified_params = Parameters.from_mapping(mutable_params)
        params_path = job_dir / params_file_name
        YAMLParametersWriter().write(modified_params,
                                     CharSink.to_file(params_path))
        params_file = PegasusContainerFile(
            name=params_file_name,
            nas=params_path,
            scratch=scratch_root / params_file_name,
            docker=docker_mount_root / params_file_name,
        )
        python_args = params_file.docker
    elif isinstance(python_args_or_parameters, str):
        python_args_tok = []
        for tok in python_args_or_parameters.split(" "):
            if tok in converted_input_files:
                python_args_tok.append(
                    str(converted_input_files[tok].docker.absolute()))
            elif tok in converted_output_files:
                python_args_tok.append(
                    str(converted_output_files[tok].docker.absolute()))
            else:
                python_args_tok.append(tok)
        python_args = " ".join(python_args_tok)
    else:
        raise RuntimeError(
            f"Cannot handle python_args_or_parameters of type {type(python_args_or_parameters)}. Data: {python_args_or_parameters}"
        )

    # Combine any user-requested pre-docker bash with automatic
    # movement of files from NAS locations to /scratch dir locations
    pre_job_bash = "\n".join(
        chain(
            [
                f"mkdir -p {scratch_root}",
                f"cp {str(params_file.nas.absolute())} {str(params_file.scratch.absolute())}"
                if params_file else "",
            ],
            [
                f"cp {str(i_file.nas.absolute())} {str(i_file.scratch.absolute())}"
                for i_file in converted_input_files.values()
            ],
            pre_docker_bash,
        ))

    # Combine any user-requested post-docker bash with automatic
    # movement of files from /scratch locations to NAS locations
    post_job_bash = "\n".join(
        chain(
            [
                f"cp {str(o_file.scratch.absolute())} {str(o_file.nas.absolute())}"
                for o_file in converted_output_files.values()
            ],
            post_docker_bash,
        ))

    # Generate the command to run the python job
    python_start = (f"-m {python_module_or_path_on_docker}"
                    if isinstance(python_module_or_path_on_docker, str) else
                    str(python_module_or_path_on_docker))
    docker_run_command = (
        f"{python_executable_path_in_docker} {python_start} {python_args}")

    return self.run_container(
        job_name,
        container.name,
        modified_docker_args,
        docker_run_command,
        container.image,
        depends_on=depends_on,
        job_is_stageable=job_is_stageable,
        job_bypass_staging=job_bypass_staging,
        times_to_retry_job=times_to_retry_job,
        job_profiles=job_profiles,
        pre_job_bash=pre_job_bash,
        post_job_bash=post_job_bash,
        category=category,
        resource_request=resource_request,
    )

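# Hedged, self-contained sketch (not from the original source) of the argument
# rewriting technique used above for string args: tokens that exactly match known
# host paths are swapped for their in-container equivalents, and everything else
# passes through unchanged. Function and variable names are illustrative only.
from pathlib import Path
from typing import Mapping


def rewrite_args_for_container(args: str,
                               host_to_container: Mapping[str, Path]) -> str:
    rewritten = []
    for tok in args.split(" "):
        # Replace a token only when it exactly matches a known host path.
        if tok in host_to_container:
            rewritten.append(str(host_to_container[tok]))
        else:
            rewritten.append(tok)
    return " ".join(rewritten)


# Example:
# rewrite_args_for_container(
#     "--input /nas/data/in.txt --verbose",
#     {"/nas/data/in.txt": Path("/docker_mount/in.txt")},
# ) == "--input /docker_mount/in.txt --verbose"
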
def _run_python_job(
    self,
    job_name: Locator,
    python_module_or_path: Any,
    args_or_params: Union[Parameters, Dict[str, Any], str],
    *,
    depends_on,
    resource_request: Optional[ResourceRequest] = None,
    override_conda_config: Optional[CondaConfiguration] = None,
    category: Optional[str] = None,
    use_pypy: bool = False,
    container: Optional[Container] = None,
    pre_job_bash: str = "",
    post_job_bash: str = "",
    job_is_stageable: bool = False,
    job_bypass_staging: bool = False,
    times_to_retry_job: int = 0,
    job_profiles: Iterable[PegasusProfile] = immutableset(),
    treat_params_as_cmd_args: bool = False,
    input_file_paths: Union[Iterable[Union[Path, str]], Path, str] = immutableset(),
    output_file_paths: Union[Iterable[Union[Path, str]], Path, str] = immutableset(),
) -> DependencyNode:
    """
    Internal function to schedule a python job for centralized logic.
    """
    job_dir = self.directory_for(job_name)
    ckpt_name = job_name / "___ckpt"
    checkpoint_path = job_dir / "___ckpt"
    signature_args = None
    depends_on = _canonicalize_depends_on(depends_on)

    if isinstance(python_module_or_path, (str, Path)):
        computed_module_or_path = python_module_or_path
    else:
        computed_module_or_path = fully_qualified_name(python_module_or_path)

    if not isinstance(args_or_params, str):
        # allow users to specify the parameters as a dict for convenience
        if not isinstance(args_or_params, Parameters):
            args_or_params = Parameters.from_mapping(args_or_params)
        params_sink = CharSink.to_string()
        YAMLParametersWriter().write(args_or_params, params_sink)
        signature_args = params_sink.last_string_written

    signature = (
        computed_module_or_path,
        signature_args if signature_args else args_or_params,
    )
    if signature in self._signature_to_job:
        logging.info("Job %s recognized as a duplicate", job_name)
        return self._signature_to_job[signature]

    if container:
        return self._run_python_in_container(
            job_name,
            computed_module_or_path,
            args_or_params,
            container,
            depends_on=depends_on,
            input_files=input_file_paths,
            output_files=output_file_paths,
            resource_request=resource_request,
            category=category,
            pre_docker_bash=pre_job_bash,
            post_docker_bash=post_job_bash,
            job_is_stageable=job_is_stageable,
            job_bypass_staging=job_bypass_staging,
            times_to_retry_job=times_to_retry_job,
            job_profiles=job_profiles,
        )

    script_path = job_dir / "___run.sh"
    stdout_path = job_dir / "___stdout.log"
    self._conda_script_generator.write_shell_script_to(
        entry_point_name=computed_module_or_path,
        parameters=args_or_params,
        working_directory=job_dir,
        script_path=script_path,
        params_path=job_dir / "____params.params",
        stdout_file=stdout_path,
        ckpt_path=checkpoint_path,
        override_conda_config=override_conda_config,
        python="pypy3" if use_pypy else "python",
        pre_job=pre_job_bash,
        post_job=post_job_bash,
        treat_params_as_cmd_args=treat_params_as_cmd_args,
    )
    script_executable = Transformation(
        self._job_name_for(job_name),
        namespace=self._namespace,
        version="4.0",
        site=self._default_site,
        pfn=script_path,
        is_stageable=job_is_stageable,
        bypass_staging=job_bypass_staging,
        arch=Arch.X86_64,
        os_type=OS.LINUX,
        container=container,
    )
    self._transformation_catalog.add_transformations(script_executable)
    resource_request = self.set_resource_request(resource_request)

    job = Job(script_executable)
    dependency_node = self._update_job_settings(
        category,
        checkpoint_path,
        ckpt_name,
        depends_on,
        job,
        job_name,
        job_profiles,
        resource_request,
        times_to_retry_job,
    )
    self._signature_to_job[signature] = dependency_node
    logging.info("Scheduled Python job %s", job_name)
    return dependency_node

def write_shell_script_to(
    self,
    entry_point_name: Union[str, Path],
    parameters: Union[Path, Parameters, str],
    *,
    working_directory: Path,
    script_path: Path,
    params_path: Optional[Path],
    stdout_file: Optional[Path] = None,
    ckpt_path: Optional[Path] = None,
    override_conda_config: Optional[CondaConfiguration] = None,
    python: str = "python",
    treat_params_as_cmd_args: bool = False,
    pre_job: str = "",
    post_job: str = "",
) -> None:
    if isinstance(parameters, Path):
        if params_path:
            raise RuntimeError(
                "Cannot specify params_path and provide a path for parameters"
            )
        params_path = parameters
    elif isinstance(parameters, Parameters):
        if not params_path:
            raise RuntimeError(
                "Params path must be specified when providing a parameters object"
            )
        YAMLParametersWriter().write(parameters, CharSink.to_file(params_path))
    elif isinstance(parameters, str):
        if not treat_params_as_cmd_args:
            raise RuntimeError(
                "Parameters can only be a str when the parameters are being treated as command line args"
            )
    else:
        raise RuntimeError(
            f"Parameters must be either Parameters, path to a param file, "
            f"or a string if treat_params_as_cmd_args is True, "
            f"but got {parameters}"
        )

    if not stdout_file:
        stdout_file = working_directory / "___stdout.log"

    script_path.write_text(
        self.generate_shell_script(
            entry_point_name=entry_point_name
            if isinstance(entry_point_name, str) else None,
            python_path=str(entry_point_name.absolute()).strip()
            if isinstance(entry_point_name, Path) else None,
            param_file=params_path if not treat_params_as_cmd_args else None,
            cmd_args=parameters.strip() if treat_params_as_cmd_args else None,
            stdout_file=stdout_file,
            working_directory=working_directory,
            ckpt_path=ckpt_path,
            override_conda_config=override_conda_config,
            python=python,
            pre_job=pre_job,
            post_job=post_job,
        ),
        encoding="utf-8",
    )
    # Mark the generated script as executable.
    script_path.chmod(script_path.stat().st_mode | stat.S_IEXEC)

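# Hedged usage sketch (not from the original source) of the method above when the
# parameters are a plain command-line string. `script_generator` is assumed to be
# an instance of the class that defines write_shell_script_to; the module name and
# argument values are illustrative only.
from pathlib import Path


def write_cmd_args_script(script_generator, job_dir: Path) -> None:
    script_generator.write_shell_script_to(
        entry_point_name="my_package.my_module",  # hypothetical module
        parameters="--input in.txt --output out.txt",  # passed through as CLI args
        working_directory=job_dir,
        script_path=job_dir / "___run.sh",
        params_path=None,  # no params file is written when args are a string
        treat_params_as_cmd_args=True,
    )
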
def run_python_on_parameters(
    self,
    job_name: Locator,
    python_module: Any,
    parameters: Union[Parameters, Dict[str, Any]],
    *,
    depends_on,
    resource_request: Optional[ResourceRequest] = None,
    override_conda_config: Optional[CondaConfiguration] = None,
    category: Optional[str] = None,
) -> DependencyNode:
    """
    Schedule a job to run the given *python_module* on the given *parameters*.

    If this job requires other jobs to be executed first,
    include them in *depends_on*.

    This method returns a `DependencyNode` which can be used in *depends_on*
    for future jobs.
    """
    job_dir = self.directory_for(job_name)
    ckpt_name = job_name / "___ckpt"
    checkpoint_path = job_dir / "___ckpt"
    depends_on = _canonicalize_depends_on(depends_on)
    if isinstance(python_module, str):
        fully_qualified_module_name = python_module
    else:
        fully_qualified_module_name = fully_qualified_name(python_module)

    # allow users to specify the parameters as a dict for convenience
    if not isinstance(parameters, Parameters):
        parameters = Parameters.from_mapping(parameters)

    # If we've already scheduled this identical job,
    # then don't schedule it again.
    params_sink = CharSink.to_string()
    YAMLParametersWriter().write(parameters, params_sink)
    signature = (fully_qualified_module_name, params_sink.last_string_written)
    if signature in self._signature_to_job:
        logging.info("Job %s recognized as a duplicate", job_name)
        return self._signature_to_job[signature]

    script_path = job_dir / "___run.sh"
    stdout_path = parameters.string(
        "logfile", default=str((job_dir / "___stdout.log").absolute()))
    self._conda_script_generator.write_shell_script_to(
        entry_point_name=fully_qualified_module_name,
        parameters=parameters,
        working_directory=job_dir,
        script_path=script_path,
        params_path=job_dir / "____params.params",
        stdout_file=stdout_path,
        ckpt_path=checkpoint_path,
        override_conda_config=override_conda_config,
    )
    script_executable = Executable(
        namespace=self._namespace,
        name=str(job_name).replace("/", "_"),
        version="4.0",
        os="linux",
        arch="x86_64",
    )
    script_executable.addPFN(path_to_pfn(script_path, site=self._default_site))
    if not self._job_graph.hasExecutable(script_executable):
        self._job_graph.addExecutable(script_executable)
    job = Job(script_executable)
    self._job_graph.addJob(job)
    for parent_dependency in depends_on:
        if parent_dependency.job:
            self._job_graph.depends(job, parent_dependency.job)
        for out_file in parent_dependency.output_files:
            job.uses(out_file, link=Link.INPUT)

    if resource_request is not None:
        resource_request = self.default_resource_request.unify(resource_request)
    else:
        resource_request = self.default_resource_request

    if category:
        job.profile(Namespace.DAGMAN, "category", category)
    resource_request.apply_to_job(job, job_name=self._job_name_for(job_name))

    # Handle output files.
    # This is currently only handled as the checkpoint file.
    # See: https://github.com/isi-vista/vista-pegasus-wrapper/issues/25
    checkpoint_pegasus_file = path_to_pegasus_file(
        checkpoint_path, site=self._default_site, name=f"{ckpt_name}")
    if checkpoint_pegasus_file not in self._added_files:
        self._job_graph.addFile(checkpoint_pegasus_file)
        self._added_files.add(checkpoint_pegasus_file)

    # If the checkpoint file already exists, we want to add it to the replica catalog
    # so that we don't run the job corresponding to the checkpoint file again
    if checkpoint_path.exists():
        with self._replica_catalog.open("a+") as handle:
            handle.write(
                f"{ckpt_name} file://{checkpoint_path} site={self._default_site}\n"
            )

    job.uses(checkpoint_pegasus_file, link=Link.OUTPUT, transfer=True)

    dependency_node = DependencyNode.from_job(
        job, output_files=[checkpoint_pegasus_file])
    self._signature_to_job[signature] = dependency_node
    logging.info("Scheduled Python job %s", job_name)
    return dependency_node

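# Hedged, minimal sketch (not from the original source) of the deduplication-by-
# signature technique used above: the parameters are serialized to a YAML string
# via CharSink.to_string() and combined with the module name to form a dictionary
# key, so scheduling the same module on the same parameters twice reuses the first
# node. Import paths are assumed to be the usual vistautils locations; the
# DependencyNode stand-in is a placeholder.
from vistautils.io_utils import CharSink
from vistautils.parameters import Parameters, YAMLParametersWriter

_signature_to_job = {}


def schedule_once(fully_qualified_module_name: str, parameters: Parameters):
    params_sink = CharSink.to_string()
    YAMLParametersWriter().write(parameters, params_sink)
    signature = (fully_qualified_module_name, params_sink.last_string_written)
    if signature in _signature_to_job:
        # Duplicate job: return the node scheduled the first time.
        return _signature_to_job[signature]
    node = object()  # stand-in for the real DependencyNode
    _signature_to_job[signature] = node
    return node
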
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
    write_doc_id_to_file_map(self.id_to_file,
                             CharSink.to_file(self._path / "_index"))

def put(self, key: str, value: str) -> None:
    out_file = self._path / key
    CharSink.to_file(out_file).write(value)
    self.id_to_file[key] = out_file

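# Hedged, self-contained sketch (not from the original source) of the pattern the
# two methods above belong to: a directory-backed sink that writes each value to a
# file named after its key and records a key-to-file index on exit. The real class
# writes the index with write_doc_id_to_file_map; this simplified stand-in uses
# plain file I/O, and the class name is hypothetical.
from pathlib import Path
from typing import Dict


class DirectoryCharSink:
    def __init__(self, path: Path) -> None:
        self._path = path
        self.id_to_file: Dict[str, Path] = {}

    def __enter__(self) -> "DirectoryCharSink":
        self._path.mkdir(parents=True, exist_ok=True)
        return self

    def put(self, key: str, value: str) -> None:
        out_file = self._path / key
        out_file.write_text(value, encoding="utf-8")
        self.id_to_file[key] = out_file

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        # Write a tab-separated key-to-path index, sorted by key.
        index = "".join(
            f"{key}\t{path}\n" for key, path in sorted(self.id_to_file.items()))
        (self._path / "_index").write_text(index, encoding="utf-8")
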