def test_file_handling_local_file_gets_force_no_copy():
    """A local ``FlyteFile`` built with ``remote_path=False`` must not be uploaded.

    The workflow returns this test module wrapped in a ``FlyteFile`` with
    ``remote_path=False``. The test then checks that the mock remote location
    was never created and that the returned path is the original local path.
    """

    @task
    def t1() -> FlyteFile:
        # This test module is used as the payload because it is known to exist.
        return FlyteFile(__file__, remote_path=False)

    @workflow
    def my_wf() -> FlyteFile:
        return t1()

    sandbox_root = context_manager.FlyteContext.current_context().file_access.get_random_local_directory()
    mock_remote = os.path.join(sandbox_root, "mock_remote")
    provider = FileAccessProvider(local_sandbox_dir=sandbox_root, raw_output_prefix=mock_remote)
    current = context_manager.FlyteContext.current_context()

    with context_manager.FlyteContextManager.with_context(current.with_file_access(provider)):
        # Only a single entry (the local flytekit folder) is expected before the run.
        assert len(os.listdir(sandbox_root)) == 1

        result = my_wf()

        # Nothing may be copied: the mock remote location must not even exist.
        assert not os.path.exists(mock_remote)

        # Flyte does not presume to handle a uri that looks like a raw path,
        # so the path handed back is the original one.
        assert result.path == __file__
def test_file_handling_local_file_gets_copied():
    """A plain local path returned from a task is copied to the raw output prefix.

    The task returns ``__file__`` as a bare string. After the workflow runs,
    exactly one file must be present under the mock remote folder and the
    resulting path must live below the sandbox directory.
    """

    @task
    def t1() -> FlyteFile:
        # This test module is used as the payload because it is known to exist.
        return __file__

    @workflow
    def my_wf() -> FlyteFile:
        return t1()

    sandbox_root = context_manager.FlyteContext.current_context().file_access.get_random_local_directory()
    mock_remote = os.path.join(sandbox_root, "mock_remote")
    provider = FileAccessProvider(local_sandbox_dir=sandbox_root, raw_output_prefix=mock_remote)
    current = context_manager.FlyteContext.current_context()

    with context_manager.FlyteContextManager.with_context(current.with_file_access(provider)):
        # Only a single entry (the local flytekit folder) is expected before the run.
        assert len(os.listdir(sandbox_root)) == 1

        result = my_wf()

        # Running the workflow copies this test file to the mock remote location.
        uploaded = os.listdir(mock_remote)
        assert len(uploaded) == 1  # the copied file

        # The resulting path points into the sandbox, i.e. at the copy.
        assert result.path.startswith(sandbox_root)
def convert(
    self, value: typing.Any, param: typing.Optional[click.Parameter], ctx: typing.Optional[click.Context]
) -> typing.Any:
    """Turn a command-line value into a ``FileParam``.

    :param value: Raw value; either a remote location or a local file path.
    :param param: The click parameter being converted (not used here).
    :param ctx: The click context (not used here).
    :return: ``FileParam`` holding the remote value unchanged, or the resolved
        local path as a string.
    :raises click.BadParameter: If the value is not remote and does not name an
        existing regular file.
    """
    # Remote locations are passed through untouched.
    if FileAccessProvider.is_remote(value):
        return FileParam(filepath=value)

    candidate = pathlib.Path(value)
    if not (candidate.exists() and candidate.is_file()):
        raise click.BadParameter(f"parameter should be a valid file path, {value}")

    return FileParam(filepath=str(candidate.resolve()))
def test_file_handling_remote_file_handling_flyte_file():
    """A ``FlyteFile`` wrapping an https URL is downloaded lazily, on first open.

    The workflow returns a ``FlyteFile`` pointing at a remote CSV. The test
    checks that nothing is written to the mock remote location, that the
    output path is a local placeholder which does not exist until the file is
    opened, and that the downloaded file keeps the remote file name.
    """
    SAMPLE_DATA = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

    @task
    def t1() -> FlyteFile:
        # The remote address is returned wrapped in a FlyteFile object.
        return FlyteFile(SAMPLE_DATA)

    @workflow
    def my_wf() -> FlyteFile:
        return t1()

    # A fresh random directory that is known to be empty.
    sandbox_root = context_manager.FlyteContext.current_context().file_access.get_random_local_directory()
    provider = FileAccessProvider(
        local_sandbox_dir=sandbox_root,
        raw_output_prefix=os.path.join(sandbox_root, "mock_remote"),
    )
    current = context_manager.FlyteContext.current_context()

    with context_manager.FlyteContextManager.with_context(current.with_file_access(provider)):
        initial_entries = os.listdir(sandbox_root)
        assert len(initial_entries) == 1  # the local_flytekit dir

        mock_remote_path = os.path.join(sandbox_root, "mock_remote")
        # The persistence layer has not created the mock remote folder yet.
        assert not os.path.exists(mock_remote_path)

        workflow_output = my_wf()

        # Still absent after the run, since the workflow output has not been used.
        assert not os.path.exists(mock_remote_path)

        # The literal from t1 carries the web address as its uri; because that is a
        # remote address, flytekit maps it to a FlyteFile on the local drive
        # without downloading it.
        assert workflow_output.path.startswith(f"{sandbox_root}{os.sep}local_flytekit")

        # The remote source is still the https address.
        assert workflow_output.remote_source == SAMPLE_DATA

        # Running the workflow creates the directories that will hold the file,
        # but the file itself has not been downloaded yet.
        local_root = os.path.join(sandbox_root, "local_flytekit")
        entries_before_open = os.listdir(local_root)
        assert len(entries_before_open) == 2
        assert not os.path.exists(workflow_output.path)

        # Opening the file triggers the download, because downloading is lazy.
        with open(workflow_output, "rb"):
            ...

        # One more entry shows up under local_flytekit once the file has been opened.
        entries_after_open = os.listdir(local_root)
        assert len(entries_after_open) == 3
        assert os.path.exists(workflow_output.path)

        # The file name is maintained on download.
        remote_file_name = os.path.split(SAMPLE_DATA)[1]
        assert str(workflow_output).endswith(remote_file_name)
def test_transformer_to_literal_local():
    """Exercise ``FlyteDirToMultipartBlobTransformer.to_literal`` on local inputs.

    Covers an empty directory, a directory with one file, a non-string
    primitive (rejected), and a path that is a regular file rather than a
    directory (rejected with ``FlyteAssertion``).
    """
    random_dir = context_manager.FlyteContext.current_context().file_access.get_random_local_directory()
    fs = FileAccessProvider(local_sandbox_dir=random_dir, raw_output_prefix=os.path.join(random_dir, "raw"))
    ctx = context_manager.FlyteContext.current_context()
    with context_manager.FlyteContextManager.with_context(ctx.with_file_access(fs)) as ctx:
        # Use a separate directory that we know won't be the same as anything generated by flytekit itself,
        # lest we accidentally try to cp -R /some/folder /some/folder/sub, which raises.
        p = "/tmp/flyte/test_fd_transformer"

        # Create an empty directory and convert it to a literal.
        if os.path.exists(p):
            shutil.rmtree(p)
        pathlib.Path(p).mkdir(parents=True)

        tf = FlyteDirToMultipartBlobTransformer()
        lt = tf.get_literal_type(FlyteDirectory)
        literal = tf.to_literal(ctx, FlyteDirectory(p), FlyteDirectory, lt)
        assert literal.scalar.blob.uri.startswith(random_dir)

        # Create a directory with one file in it.
        if os.path.exists(p):
            shutil.rmtree(p)
        pathlib.Path(p).mkdir(parents=True)
        with open(os.path.join(p, "xyz"), "w") as fh:
            fh.write("Hello world\n")
        literal = tf.to_literal(ctx, FlyteDirectory(p), FlyteDirectory, lt)

        mock_remote_files = os.listdir(literal.scalar.blob.uri)
        assert mock_remote_files == ["xyz"]

        # The only primitives allowed are strings.
        with pytest.raises(AssertionError):
            tf.to_literal(ctx, 3, FlyteDirectory, lt)

        with pytest.raises(TypeError, match="No automatic conversion from <class 'int'>"):
            TypeEngine.to_literal(ctx, 3, FlyteDirectory, lt)

        # A regular file must be rejected. The fixture is prepared outside of
        # pytest.raises so that only the to_literal call itself is allowed to
        # satisfy the expected exception.
        p = "/tmp/flyte/xyz"
        path = pathlib.Path(p)
        try:
            path.unlink()
        except OSError:
            # Best effort: the file may simply not exist from a previous run.
            ...
        with open(p, "w") as fh:
            fh.write("hello world\n")

        with pytest.raises(FlyteAssertion):
            tf.to_literal(ctx, FlyteDirectory(p), FlyteDirectory, lt)
def convert(
    self, value: typing.Any, param: typing.Optional[click.Parameter], ctx: typing.Optional[click.Context]
) -> typing.Any:
    """Turn a command-line value into a ``Directory``.

    :param value: Raw value; either a remote location or a local directory path.
    :param param: The click parameter being converted (not used here).
    :param ctx: The click context (not used here).
    :return: ``Directory`` marked ``local=False`` for remote values; otherwise a
        ``Directory`` carrying the resolved path of the single entry found in
        the local directory.
    :raises ValueError: If the local directory does not contain exactly one entry.
    :raises click.BadParameter: If the value is not remote and is not an
        existing directory.
    """
    # Remote locations are passed through untouched.
    if FileAccessProvider.is_remote(value):
        return Directory(dir_path=value, local=False)

    candidate = pathlib.Path(value)
    if not (candidate.exists() and candidate.is_dir()):
        raise click.BadParameter(f"parameter should be a valid directory path, {value}")

    entries = list(candidate.iterdir())
    if len(entries) != 1:
        raise ValueError(
            f"Currently only directories containing one file are supported, found [{len(entries)}] files found in {candidate.resolve()}"
        )
    return Directory(dir_path=value, local_file=entries[0].resolve())
def test_transformer_to_literal_remote():
    """A remote ``FlyteDirectory`` keeps its uri unchanged through ``to_literal``."""
    sandbox_root = context_manager.FlyteContext.current_context().file_access.get_random_local_directory()
    provider = FileAccessProvider(
        local_sandbox_dir=sandbox_root,
        raw_output_prefix=os.path.join(sandbox_root, "raw"),
    )
    current = context_manager.FlyteContext.current_context()
    with context_manager.FlyteContextManager.with_context(current.with_file_access(provider)) as ctx:
        # A directory kept apart from anything flytekit generates itself.
        # NOTE(review): this directory is not referenced by the assertion below;
        # it looks like setup shared with the local-directory test -- confirm
        # before removing.
        scratch_dir = "/tmp/flyte/test_fd_transformer"
        if os.path.exists(scratch_dir):
            shutil.rmtree(scratch_dir)
        pathlib.Path(scratch_dir).mkdir(parents=True)

        transformer = FlyteDirToMultipartBlobTransformer()
        literal_type = transformer.get_literal_type(FlyteDirectory)

        # Remote directories should be copied as is.
        remote_uri = "s3://anything"
        literal = transformer.to_literal(ctx, FlyteDirectory(remote_uri), FlyteDirectory, literal_type)
        assert literal.scalar.blob.uri == remote_uri
def setup_execution(
    raw_output_data_prefix: str,
    checkpoint_path: Optional[str] = None,
    prev_checkpoint: Optional[str] = None,
    dynamic_addl_distro: Optional[str] = None,
    dynamic_dest_dir: Optional[str] = None,
):
    """
    Build the task-execution context and yield it.

    NOTE(review): this is a generator function; it is presumably wrapped as a context manager where it is
    defined (no decorator is visible in this block) -- confirm.

    :param raw_output_data_prefix: Passed as ``raw_output_prefix`` to both the execution parameters and the
        ``FileAccessProvider`` created here.
    :param checkpoint_path: When not ``None``, a ``SyncCheckpoint`` is created with this as its destination.
    :param prev_checkpoint: Source handed to the ``SyncCheckpoint``; only used when ``checkpoint_path`` is set.
    :param dynamic_addl_distro: Works in concert with the other dynamic arg. If present (and serialized settings
        are available from the environment), fast serialization is enabled in the serialization settings and this
        value is used as the distribution location.
    :param dynamic_dest_dir: See above; used as the fast-serialization destination directory.
    :return: Yields the context entered through ``FlyteContextManager.with_context``.
    """
    # Execution-level identifiers.
    execution_project = get_one_of("FLYTE_INTERNAL_EXECUTION_PROJECT", "_F_PRJ")
    execution_domain = get_one_of("FLYTE_INTERNAL_EXECUTION_DOMAIN", "_F_DM")
    execution_name = get_one_of("FLYTE_INTERNAL_EXECUTION_ID", "_F_NM")
    execution_workflow = get_one_of("FLYTE_INTERNAL_EXECUTION_WORKFLOW", "_F_WF")
    execution_launchplan = get_one_of("FLYTE_INTERNAL_EXECUTION_LAUNCHPLAN", "_F_LP")

    # Task-level identifiers.
    task_project = get_one_of("FLYTE_INTERNAL_TASK_PROJECT", "_F_TK_PRJ")
    task_domain = get_one_of("FLYTE_INTERNAL_TASK_DOMAIN", "_F_TK_DM")
    task_name = get_one_of("FLYTE_INTERNAL_TASK_NAME", "_F_TK_NM")
    task_version = get_one_of("FLYTE_INTERNAL_TASK_VERSION", "_F_TK_V")

    serialized_settings = os.environ.get(SERIALIZED_CONTEXT_ENV_VAR, "")

    outer_ctx = FlyteContextManager.current_context()

    # Create directories
    workspace_dir = outer_ctx.file_access.get_random_local_directory()
    logger.info(f"Using user directory {workspace_dir}")
    pathlib.Path(workspace_dir).mkdir(parents=True, exist_ok=True)

    from flytekit import __version__ as _api_version

    if checkpoint_path is None:
        checkpointer = None
    else:
        checkpointer = SyncCheckpoint(checkpoint_dest=checkpoint_path, checkpoint_src=prev_checkpoint)
        logger.debug(f"Checkpointer created with source {prev_checkpoint} and dest {checkpoint_path}")

    execution_id = _identifier.WorkflowExecutionIdentifier(
        project=execution_project,
        domain=execution_domain,
        name=execution_name,
    )
    execution_date = _datetime.datetime.utcnow()

    # Stats metric path will be:
    #   registration_project.registration_domain.app.module.task_name.user_stats
    # and it will be tagged with execution-level values for project/domain/wf/lp
    stats_cfg = StatsConfig.auto()
    stats_prefix = f"{task_project}.{task_domain}.{task_name}.user_stats"
    stats_tags = {
        "exec_project": execution_project,
        "exec_domain": execution_domain,
        "exec_workflow": execution_workflow,
        "exec_launchplan": execution_launchplan,
        "api_version": _api_version,
    }
    stats = _get_stats(cfg=stats_cfg, prefix=stats_prefix, tags=stats_tags)

    task_id = _identifier.Identifier(
        _identifier.ResourceType.TASK, task_project, task_domain, task_name, task_version
    )

    execution_parameters = ExecutionParameters(
        execution_id=execution_id,
        execution_date=execution_date,
        stats=stats,
        logging=user_space_logger,
        tmp_dir=workspace_dir,
        raw_output_prefix=raw_output_data_prefix,
        checkpoint=checkpointer,
        task_id=task_id,
    )

    try:
        file_access = FileAccessProvider(
            local_sandbox_dir=tempfile.mkdtemp(prefix="flyte"),
            raw_output_prefix=raw_output_data_prefix,
        )
    except TypeError:  # would be thrown from DataPersistencePlugins.find_plugin
        logger.error(f"No data plugin found for raw output prefix {raw_output_data_prefix}")
        raise

    execution_state = outer_ctx.new_execution_state().with_params(
        mode=ExecutionState.Mode.TASK_EXECUTION,
        user_space_params=execution_parameters,
    )
    builder = outer_ctx.new_builder().with_file_access(file_access).with_execution_state(execution_state)

    if compressed := serialized_settings:
        settings_builder = SerializationSettings.from_transport(compressed).new_builder()
        settings_builder.project = execution_project
        settings_builder.domain = execution_domain
        settings_builder.version = task_version
        if dynamic_addl_distro:
            settings_builder.fast_serialization_settings = FastSerializationSettings(
                enabled=True,
                destination_dir=dynamic_dest_dir,
                distribution_location=dynamic_addl_distro,
            )
        builder = builder.with_serialization_settings(settings_builder.build())

    with FlyteContextManager.with_context(builder) as task_ctx:
        yield task_ctx
def test_is_remote():
    """``is_remote`` is False for relative, absolute and ``file://`` paths, True for ``s3://``."""
    provider = FileAccessProvider("/tmp", "s3://my-bucket")
    expectations = (
        ("./checkpoint", False),
        ("/tmp/foo/bar", False),
        ("file://foo/bar", False),
        ("s3://my-bucket/foo/bar", True),
    )
    for location, expected in expectations:
        assert provider.is_remote(location) is expected
def test_get_random_remote_path():
    """A random remote path is rooted at the configured raw output prefix."""
    raw_prefix = "s3://my-bucket"
    provider = FileAccessProvider("/tmp", raw_prefix)
    remote_path = provider.get_random_remote_path()
    assert remote_path.startswith(raw_prefix)