def build_pipeline(dataset, ws, config):
    print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))

    base_dir = '.'

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to AML compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    shutil.copy(os.path.join(base_dir, 'video_decoding.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_submit.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_create.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'train.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'prednet.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'keras_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_preparation.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'model_registration.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'config.json'), script_folder)

    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except ComputeTargetException:  # from azureml.core.compute_target
        print("creating new compute target")
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_D2_V2',
            max_nodes=4,
            idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name, provisioning_config)
        cpu_compute_target.wait_for_completion(
            show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = config['gpu_compute']
    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except ComputeTargetException:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6',
            max_nodes=10,
            idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name, provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(
            show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    try:
        print(gpu_compute_target.get_status().serialize())
    except BaseException as e:
        print("Could not get status of compute target.")
        print(e)

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(
        conda_packages=["py-opencv=3.4.2"],
        pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D',
        pip_packages=["azure-storage-blob==1.5.0", "hickle==3.4.3", "requests==2.21.0",
                      "sklearn", "pandas==0.24.2", "azureml-sdk", "numpy==1.16.2",
                      "pillow==6.0.0"])

    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False
    print("PipelineData object created")

    # DataReference to where video data is stored.
    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))
    print("DataReference object created")

    # Intermediate PipelineData passed between the steps below.
    raw_data = PipelineData("raw_video_frames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames", datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.'])
    print("video_decode step created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py",
        arguments=["--input_data", raw_data, "--output_data", preprocessed_data],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.'])
    data_prep.run_after(video_decoding)
    print("data_prep step created")

    # configure access to ACR for pulling our custom docker image
    acr = ContainerRegistry()
    acr.address = config['acr_address']
    acr.username = config['acr_username']
    acr.password = config['acr_password']

    est = Estimator(
        source_directory=script_folder,
        compute_target=gpu_compute_target,
        entry_script='train.py',
        use_gpu=True,
        node_count=1,
        custom_docker_image="wopauli_1.8-gpu:1",
        image_registry_details=acr,
        user_managed=True)

    ps = RandomParameterSampling({
        '--batch_size': choice(1, 2, 4, 8),
        '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        '--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),  # , "48, 96"
        '--learning_rate': loguniform(-6, -1),
        '--lr_decay': loguniform(-9, -1),
        '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
        '--transfer_learning': choice("True", "False")
    })

    policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=10)

    hdc = HyperDriveConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        policy=policy,
        primary_metric_name='val_loss',
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=10,
        max_concurrent_runs=5,
        max_duration_minutes=60 * 6)

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data,
            '--remote_execution',
            '--dataset', dataset],
        inputs=[preprocessed_data],
        metrics_output=data_metrics,
        allow_reuse=True)
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=cpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.'])
    registration_step.run_after(hd_step)

    pipeline = Pipeline(workspace=ws, steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    schedule = Schedule.create(
        workspace=ws,
        name=pipeline_name + "_sch",
        pipeline_id=published_pipeline.id,
        experiment_name=pipeline_name,
        datastore=def_blob_store,
        wait_for_provisioning=True,
        description="Datastore scheduler for Pipeline " + pipeline_name,
        path_on_datastore=os.path.join('prednet/data/video', dataset, 'Train'),
        polling_interval=1)

    return pipeline_name
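# A minimal driver sketch (assumptions: Workspace.from_config() resolves your workspace,
# and config.json carries the keys the function reads - cpu_compute, gpu_compute,
# acr_address, acr_username, acr_password - plus a hypothetical "datasets" list).
# This illustrates one way build_pipeline might be invoked, e.g. from pipelines_create.py;
# it is not taken verbatim from that script.
if __name__ == "__main__":
    import json
    from azureml.core import Workspace

    ws = Workspace.from_config()
    with open("config.json") as f:
        config = json.load(f)

    for dataset in config.get("datasets", []):  # hypothetical key listing dataset names
        pipeline_name = build_pipeline(dataset, ws, config)
        print("published and scheduled:", pipeline_name)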
print("training step created") # Define Pipeline pipeline = Pipeline(workspace=ws, steps=[train_xception]) print("Pipeline is built") # Validate Pipeline pipeline.validate() print("Validation complete") pipeline_name = 'kd_train_the_teacher' # We need to disable (delete) previously published pipelines, because we can't have two published pipelines with the same name from utils.azure import disable_pipeline disable_pipeline(pipeline_name=pipeline_name, prefix='', dry_run=False) # Publish Pipeline published_pipeline = pipeline.publish(name=pipeline_name) print("Pipeline is built") # Put the pipeline on a schedule schedule = Schedule.create(workspace=ws, name=pipeline_name + "_sch", pipeline_id=published_pipeline.id, experiment_name=pipeline_name, datastore=def_blob_store, wait_for_provisioning=True, description="Datastore scheduler for Pipeline" + pipeline_name, path_on_datastore=path_on_datastore, polling_interval=60)
def disable_pipeline(pipeline_name="", dry_run=True): from azureml.pipeline.core import PublishedPipeline from azureml.pipeline.core.schedule import Schedule if dry_run: print("Dry run: only printing what would be done") else: print("Disabling pipelines") ws = get_workspace() # Get all published pipeline objects in the workspace all_pub_pipelines = PublishedPipeline.list(ws) # We will iterate through the list of published pipelines and # use the last ID in the list for Schedule operations: print("Published pipelines found in the workspace:") for pub_pipeline in all_pub_pipelines: if ( pub_pipeline.name.startswith("prednet") and pub_pipeline.name == pipeline_name or pipeline_name == "" ): print("Found pipeline:", pub_pipeline.name, pub_pipeline.id) pub_pipeline_id = pub_pipeline.id schedules = Schedule.list(ws, pipeline_id=pub_pipeline_id) # We will iterate through the list of schedules and # use the last ID in the list for further operations: print( "Found these schedules for the pipeline id {}:".format( pub_pipeline_id ) ) for schedule in schedules: print(schedule.name, schedule.id) if not dry_run: schedule_id = schedule.id print( "Schedule id to be used for schedule " "operations: {}".format( schedule_id ) ) fetched_schedule = Schedule.get(ws, schedule_id) print( "Using schedule with id: {}".format( fetched_schedule.id ) ) fetched_schedule.disable(wait_for_provisioning=True) fetched_schedule = Schedule.get(ws, schedule_id) print( "Disabled schedule {}. New status is: {}".format( fetched_schedule.id, fetched_schedule.status ) ) if not dry_run: print("Disabling pipeline") pub_pipeline.disable()
# --- add a schedule for the pipeline (if told to do so)
# note: this is a sample schedule which runs time-based.
#       there is also the option to trigger the pipeline based on changes.
#       details at https://github.com/Azure/MachineLearningNotebooks/blob/4e7b3784d50e81c313c62bcdf9a330194153d9cd/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-setup-schedule-for-a-published-pipeline.ipynb
if schedule:
    recurrence = ScheduleRecurrence(frequency="Day", interval=2, hours=[22], minutes=[30])
    schedule = Schedule.create(
        workspace=workspace,
        name="Every-Other-Day-At-10-30-PM",
        pipeline_id=published_pipeline.id,
        experiment_name=pipeline_name,
        recurrence=recurrence,
        wait_for_provisioning=True,
        description="A sample schedule which runs every other day at 10:30pm.",
    )

# --- trigger pipeline endpoint if we have been told to do so
if trigger_after_publish:
    print(f"Triggering pipeline endpoint '{pipeline_name}' (as configured)...")
    pipeline_run = Experiment(workspace, pipeline_name).submit(pipeline_endpoint)
    # pipeline_run.wait_for_completion()

# --- Done
print("Done.")
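# A minimal sketch of the change-triggered alternative mentioned in the note above
# (the watched folder name is a placeholder): instead of a recurrence, pass a
# datastore plus path_on_datastore, and the published pipeline runs whenever blobs
# under that path are added or modified.
#
# from azureml.pipeline.core.schedule import Schedule
# datastore = workspace.get_default_datastore()
# Schedule.create(
#     workspace=workspace,
#     name=pipeline_name + "_on_new_data",
#     pipeline_id=published_pipeline.id,
#     experiment_name=pipeline_name,
#     datastore=datastore,
#     path_on_datastore="my_data/Train",   # hypothetical folder to watch
#     polling_interval=5,                  # minutes between checks for new blobs
#     wait_for_provisioning=True,
#     description="Run the pipeline when new data lands in the datastore.",
# )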
published_pipeline = pipeline.publish(name=pipeline_name)
print("pipeline id: ", published_pipeline.id)

datastore = ws.get_default_datastore()

with open("placeholder.txt", "w") as f:
    f.write("This is just a placeholder to ensure "
            "that this path exists in the blobstore.\n")

datastore.upload_files(
    [os.path.join(os.getcwd(), "placeholder.txt")],
    target_path="prednet/data/raw_data/",
)

schedule = Schedule.create(
    workspace=ws,
    name=pipeline_name + "_sch",
    pipeline_id=published_pipeline.id,
    experiment_name="prednet_master",
    datastore=datastore,
    wait_for_provisioning=True,
    description="Datastore scheduler for Pipeline " + pipeline_name,
    path_on_datastore="prednet/data/raw_data",
    polling_interval=5,
)
print("Created schedule with id: {}".format(schedule.id))

published_pipeline.submit(ws, published_pipeline.name)
def createPipeline(self):
    '''
    A pipeline is a series of steps, but it also requires DataReference
    objects in those steps so that it knows where to get data from and
    where to deposit outputs.

    In this step, if a PublishedPipeline exists by name, a new pipeline is
    not created. If one is created, a new docker container is generated in
    the ACR instance associated with this AMLS workspace.
    '''
    self.publishedPipeline = getExistingPipeline(
        self.workspace,
        self.programArguments.pipeline_name,
        self.job_log)

    if self.publishedPipeline:
        print("Found existing pipeline - ", self.programArguments.pipeline_name)
        if self.job_log:
            self.job_log.addInfo("{} - PublishedPipeline {} exists".format(
                self.job_log.lastStep(),
                self.programArguments.pipeline_name))
    else:
        print("Creating pipeline - ", self.programArguments.pipeline_name)
        if self.job_log:
            self.job_log.addInfo("{} - Creating PublishedPipeline {}".format(
                self.job_log.lastStep(),
                self.programArguments.pipeline_name))

        print("Creating pipeline steps .....")
        self._createPipelineSteps()
        self.pipeLine = Pipeline(workspace=self.workspace, steps=self.pipelineStep)
        self.pipeLine.validate()

        print("Publishing pipeline .....")
        self.publishedPipeline = self.pipeLine.publish(
            name=self.programArguments.pipeline_name,
            description="Dummy Pipeline")

        '''
        Now we schedule it. This step on its own will create the AMLS
        experiment tied to this service. Unlike with the RTS example, no
        model is created in this step.

        Next we generate the schedule recurrence (when this pipeline should
        run), and finally create the schedule by identifying the published
        pipeline that is being requested.
        '''
        print("Scheduling pipeline .....")
        experiment_name = "exp_" + self.programArguments.pipeline_name
        recurrence = ScheduleRecurrence(
            frequency=self.programArguments.schedule_frequency,
            interval=self.programArguments.schedule_interval)

        self.Schedule = Schedule.create(
            workspace=self.workspace,
            name="{}_sched".format(self.programArguments.pipeline_name),
            pipeline_id=self.publishedPipeline.id,
            experiment_name=experiment_name,
            recurrence=recurrence,
            description="Pipeline schedule for {}".format(
                self.programArguments.pipeline_name),
        )

    '''
    Print out what we know of the pipeline, in particular its status and
    its endpoint.
    '''
    print("Pipeline : ", self.publishedPipeline.name)
    print("Pipeline Endpoint: ", self.publishedPipeline.endpoint)
    print("Pipeline Status: ", self.publishedPipeline.status)
def build_prednet_pipeline(dataset, ws):
    print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))

    base_dir = "."

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to AML compute target
    script_folder = "./scripts"
    os.makedirs(script_folder)

    shutil.copytree(os.path.join(base_dir, "models"),
                    os.path.join(base_dir, script_folder, "models"))
    shutil.copy(os.path.join(base_dir, "train.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "data_preparation.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "register_prednet.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "batch_scoring.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "train_clf.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "register_clf.py"), script_folder)

    cpu_compute_name = args.cpu_compute_name
    cpu_compute_target = AmlCompute(ws, cpu_compute_name)
    print("found existing compute target: %s" % cpu_compute_name)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = args.gpu_compute_name
    gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
    print(gpu_compute_target.get_status().serialize())

    env = Environment.get(ws, "prednet")

    # Runconfigs
    runconfig = RunConfiguration()
    runconfig.environment = env
    print("PipelineData object created")

    # DataReference to where raw data is stored.
    raw_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="raw_data",
        path_on_datastore=os.path.join("prednet", "data", "raw_data"),
    )
    print("DataReference object created")

    # Intermediate PipelineData passed between the steps below.
    preprocessed_data = PipelineData("preprocessed_data", datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    hd_child_cwd = PipelineData("prednet_model_path", datastore=def_blob_store)
    # prednet_path = PipelineData("outputs", datastore=def_blob_store)
    scored_data = PipelineData("scored_data", datastore=def_blob_store)
    model_path = PipelineData("model_path", datastore=def_blob_store)

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name="prepare_data",
        script_name="data_preparation.py",
        arguments=[
            "--raw_data", raw_data,
            "--preprocessed_data", preprocessed_data,
            "--dataset", dataset,
        ],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    # data_prep.run_after(video_decoding)
    print("data_prep step created")

    est = Estimator(
        source_directory=script_folder,
        compute_target=gpu_compute_target,
        entry_script="train.py",
        node_count=1,
        environment_definition=env,
    )

    ps = BayesianParameterSampling({
        "--batch_size": choice(1, 2, 4, 10),
        "--filter_sizes": choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        "--stack_sizes": choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),
        "--learning_rate": uniform(1e-6, 1e-3),
        "--lr_decay": uniform(1e-9, 1e-2),
        "--freeze_layers": choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
        # "--fine_tuning": choice("True", "False"),
    })

    hdc = HyperDriveConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        primary_metric_name="val_loss",
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=3,
        max_concurrent_runs=3,
        max_duration_minutes=60 * 6,
    )

    train_prednet = HyperDriveStep(
        "train_w_hyperdrive",
        hdc,
        estimator_entry_script_arguments=[
            "--preprocessed_data", preprocessed_data,
            "--remote_execution",
            "--dataset", dataset,
        ],
        inputs=[preprocessed_data],
        outputs=[hd_child_cwd],
        metrics_output=data_metrics,
        allow_reuse=True,
    )
    train_prednet.run_after(data_prep)

    register_prednet = PythonScriptStep(
        name="register_prednet",
        script_name="register_prednet.py",
        arguments=[
            "--data_metrics", data_metrics,
        ],
        compute_target=cpu_compute_target,
        inputs=[data_metrics, hd_child_cwd],
        source_directory=script_folder,
        allow_reuse=True,
    )
    register_prednet.run_after(train_prednet)

    batch_scoring = PythonScriptStep(
        name="batch_scoring",
        script_name="batch_scoring.py",
        arguments=[
            "--preprocessed_data", preprocessed_data,
            "--scored_data", scored_data,
            "--dataset", dataset,
            # "--prednet_path", prednet_path,
        ],
        compute_target=gpu_compute_target,
        inputs=[preprocessed_data],
        outputs=[scored_data],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    batch_scoring.run_after(register_prednet)

    train_clf = PythonScriptStep(
        name="train_clf",
        script_name="train_clf.py",
        arguments=[
            "--preprocessed_data", preprocessed_data,
            "--scored_data", scored_data,
            "--model_path", model_path,
        ],
        compute_target=cpu_compute_target,
        inputs=[preprocessed_data, scored_data],
        outputs=[model_path],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    train_clf.run_after(batch_scoring)

    register_clf = PythonScriptStep(
        name="register_clf",
        script_name="register_clf.py",
        arguments=["--model_path", model_path],
        inputs=[model_path],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        allow_reuse=True,
        runconfig=runconfig,
    )
    register_clf.run_after(train_clf)

    pipeline = Pipeline(
        workspace=ws,
        steps=[
            data_prep,
            train_prednet,
            register_prednet,
            batch_scoring,
            train_clf,
            register_clf,
        ],
    )
    pipeline.validate()

    pipeline_name = "prednet_" + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    _ = Schedule.create(
        workspace=ws,
        name=pipeline_name + "_sch",
        pipeline_id=published_pipeline.id,
        experiment_name=pipeline_name,
        datastore=def_blob_store,
        wait_for_provisioning=True,
        description="Datastore scheduler for Pipeline " + pipeline_name,
        path_on_datastore=os.path.join("prednet/data/raw_data", dataset, "Train"),
        polling_interval=60 * 24,
    )

    published_pipeline.submit(ws, pipeline_name)
deploy_model = PythonScriptStep(
    script_name="src/deploy_model.py",
    compute_target=compute_target,
    runconfig=aml_run_config)

steps = [train_model, deploy_model]
step_seq = StepSequence(steps=steps)
pipeline = Pipeline(workspace=ws, steps=step_seq)

pp = pipeline.publish(
    name="TitanicDeploymentPipeline",
    description="Training and deployment pipeline for our titanic API.",
    version="1.0")

# Set up a time-based schedule for monthly retraining. A change-triggered schedule
# (via the Datastore class) is also possible:
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-schedule-pipelines#create-a-time-based-schedule
recurrence = ScheduleRecurrence(frequency="Month", interval=1, start_time='2020-11-01T00:00:00')
recurring_schedule = Schedule.create(
    ws,
    name="TitanicRetrainingSchedule",
    description="Once a month training",
    pipeline_id=pp.id,
    experiment_name=exp_name,
    recurrence=recurrence)

run = pp.submit(ws, experiment_name=exp_name)
run_id = run.id

exp = Experiment(ws, exp_name)
r = Run(exp, run_id)
r.get_details()
    allow_reuse=False,
    hash_paths=["."])
print("pipeline submit step created")

submit_pipelines.run_after(build_pipelines)

pipeline = Pipeline(workspace=ws, steps=[build_pipelines, submit_pipelines])
print("Pipeline created")

pipeline.validate()
print("Validation complete")

pipeline_name = 'prednet_master'
published_pipeline = pipeline.publish(name=pipeline_name)
print("pipeline id: ", published_pipeline.id)

datastore = ws.get_default_datastore()

schedule = Schedule.create(
    workspace=ws,
    name=pipeline_name + "_sch",
    pipeline_id=published_pipeline.id,
    experiment_name='Schedule_Run',
    datastore=datastore,
    wait_for_provisioning=True,
    description="Datastore scheduler for Pipeline " + pipeline_name)
print("Created schedule with id: {}".format(schedule.id))

published_pipeline.submit(ws, published_pipeline.name)