def create_databricks_step(
        input_dataset: Dataset,
        compute: ComputeTarget,
        debug_run: bool) -> Tuple[DatabricksStep, PipelineData]:
    """
    Create the "Convert to Parquet" Databricks pipeline step.

    The step reads the CSV dataset, runs prep_data.py on a single-worker
    Databricks cluster, and writes the result into a PipelineData folder.

    :param input_dataset: source CSV dataset, mounted as "CSVFiles".
    :param compute: attached Databricks compute target to run on.
    :param debug_run: passed through as allow_reuse so debug runs can
        reuse a previous step output.
    :return: (the Databricks step, the PipelineData holding the parquet files)
    """
    # Output folder on the workspace default datastore.
    parquet_output = PipelineData(
        name="ParquetFiles",
        datastore=WS.get_default_datastore(),
        is_directory=True)

    step = DatabricksStep(
        name='Convert to Parquet',
        inputs=[input_dataset.as_named_input("CSVFiles")],
        outputs=[parquet_output],
        source_directory="./safe-driver/prep_data",
        python_script_name='prep_data.py',
        # Collapse the output to a single parquet file.
        python_script_params=["--number-of-files", "1"],
        num_workers=1,
        compute_target=compute,
        pypi_libraries=[],
        allow_reuse=debug_run,
        node_type='Standard_DS4_v2',
        spark_version='7.3.x-cpu-ml-scala2.12',
    )
    return step, parquet_output
def create_evaluate_model_step(
        model_metadata_folder: PipelineData,
        compute: ComputeTarget,
        validation_data: Dataset,
        debug_run: bool) -> Tuple[PythonScriptStep, PipelineData]:
    """
    Create the "Evaluate Model" pipeline step.

    Runs evaluate.py against the trained model's metadata and the
    validation dataset; the step uploads its "./outputs" folder as a
    PipelineData named "RegisterModel" for the downstream register step.

    :param model_metadata_folder: PipelineData produced by the training step.
    :param compute: compute target for the script step.
    :param validation_data: dataset mounted as "ValidationData".
    :param debug_run: passed through as allow_reuse.
    :return: (the evaluate step, the "RegisterModel" PipelineData)
    """
    compute_side_folder = "./outputs"

    # Uploaded from the compute at the end of the step run.
    register_output = PipelineData(
        name="RegisterModel",
        datastore=WS.get_default_datastore(),
        is_directory=True,
        output_path_on_compute=compute_side_folder,
        output_mode="upload")

    script_args = [
        "--model-metadata", model_metadata_folder.as_mount(),
        "--register-model-folder", compute_side_folder,
        "--validation-data",
        validation_data.as_named_input("ValidationData").as_mount(),
    ]

    step = PythonScriptStep(
        name="Evaluate Model",
        script_name="evaluate.py",
        source_directory='./safe-driver/evaluate/',
        compute_target=compute,
        inputs=[model_metadata_folder],
        outputs=[register_output],
        arguments=script_args,
        allow_reuse=debug_run,
        runconfig=RC)

    return step, register_output
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import Pipeline, PipelineData

# Connect to the workspace described by the local config.json.
ws = Workspace.from_config()

df = Dataset(ws, "my_dataset")
ds = ws.get_default_datastore()

# Intermediate folder passed from the prepare step to the training step.
prepped_data = PipelineData("prepped", ds)

# Step 1: read the raw dataset and write prepared data to prepped_data.
# FIX: a comma was missing between arguments=[...] and outputs=[...],
# which was a syntax error.
step1 = PythonScriptStep(
    name="prepare_data",
    source_directory="Scripts",
    script_name="prepare.py",
    compute_target="my_cluster1",
    arguments=['--raw_data', df.as_named_input('raw_data'),
               '--out_folder', prepped_data],
    outputs=[prepped_data])

# Step 2: consume the prepared data.
# FIX: missing comma before the final keyword, and PythonScriptStep takes
# "inputs=", not "input=" — the original would raise a TypeError even
# after the syntax error was fixed.
step2 = PythonScriptStep(
    name="training_data",
    source_directory="Scripts",
    script_name="training.py",
    compute_target="my_cluster2",
    arguments=['--in_folder', prepped_data],
    inputs=[prepped_data])