def __init__(self, execution_script, runtime, conda_file=None, docker_file=None, schema_file=None,
             dependencies=None, enable_gpu=None, tags=None, properties=None, description=None,
             base_image=None, base_image_registry=None, allow_absolute_path=False, cuda_version=None):
    """Initialize the config object.

    :param execution_script: Path to local file that contains the code to run for the image
    :type execution_script: str
    :param runtime: Which runtime to use for the image. Current supported runtimes are
        'spark-py' and 'python'
    :type runtime: str
    :param conda_file: Path to local file containing a conda environment definition to use
        for the image
    :type conda_file: str
    :param docker_file: Path to local file containing additional Docker steps to run when
        setting up the image
    :type docker_file: str
    :param schema_file: Path to local file containing a webservice schema to use when the
        image is deployed
    :type schema_file: str
    :param dependencies: List of paths to additional files/folders that the image needs to run
    :type dependencies: :class:`list[str]`
    :param enable_gpu: Whether or not to enable GPU support in the image. The GPU image must
        be used on Microsoft Azure Services such as Azure Container Instances, Azure Machine
        Learning Compute, Azure Virtual Machines, and Azure Kubernetes Service. Defaults to False.
    :type enable_gpu: bool
    :param tags: Dictionary of key value tags to give this image
    :type tags: dict[str, str]
    :param properties: Dictionary of key value properties to give this image. These properties
        cannot be changed after deployment, however new key value pairs can be added
    :type properties: dict[str, str]
    :param description: A description to give this image
    :type description: str
    :param base_image: A custom image to be used as the base image. If no base image is given,
        a default base image is chosen based on the given runtime parameter.
    :type base_image: str
    :param base_image_registry: Image registry that contains the base image.
    :type base_image_registry: azureml.core.container_registry.ContainerRegistry
    :param allow_absolute_path: Flag to allow the absolute path
    :type allow_absolute_path: bool
    :param cuda_version: Version of CUDA to install for images that need GPU support. The GPU
        image must be used on Microsoft Azure Services such as Azure Container Instances,
        Azure Machine Learning Compute, Azure Virtual Machines, and Azure Kubernetes Service.
        Supported versions are 9.0, 9.1, and 10.0. If 'enable_gpu' is set, this defaults to '9.1'.
    :type cuda_version: str
    :raises: azureml.exceptions.WebserviceException
    """
    warnings.warn("ContainerImageConfig class has been deprecated and will be removed in a future release. " +
                  "Please migrate to using Environments. " +
                  "https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-environments",
                  category=DeprecationWarning, stacklevel=2)

    self.execution_script = execution_script
    self.runtime = runtime
    self.conda_file = conda_file
    self.docker_file = docker_file
    self.schema_file = schema_file
    self.dependencies = dependencies
    self.enable_gpu = enable_gpu
    self.tags = tags
    self.properties = properties
    self.description = description
    self.base_image = base_image
    self.base_image_registry = base_image_registry or ContainerRegistry()
    self.allow_absolute_path = allow_absolute_path
    self.cuda_version = cuda_version

    # Resolve the directory containing the execution script so relative paths can be validated.
    self.execution_script_path = os.path.abspath(os.path.dirname(self.execution_script))
    self.validate_configuration()
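# Hedged migration sketch (not part of the original source): the deprecation warning above points
# to Environments as the replacement for ContainerImageConfig. Assuming the azureml-core SDK, a
# conda_file / base_image / base_image_registry configuration maps roughly onto an Environment as
# shown below; the environment name, file path, and registry address are placeholders.
from azureml.core import Environment

env = Environment.from_conda_specification(name="myenv", file_path="myenv.yml")  # ~ conda_file
env.docker.base_image = "myregistry.azurecr.io/mybase:latest"                    # ~ base_image (placeholder)
env.docker.base_image_registry.address = "myregistry.azurecr.io"                 # ~ base_image_registry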
def launch_experiment(ws, conf_aml, conf_cluster, conf_docker, conf_experiment):
    # Register the input and output data blob containers
    input_ds = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name='petridishdata',
        container_name='datasets',
        account_name='petridishdata',
        account_key=conf_aml['azure_storage_account_key'],
        create_if_not_exists=False)

    output_ds = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name='petridishoutput',
        container_name='amloutput',
        account_name='petridishdata',
        account_key=conf_aml['azure_storage_account_key'],
        create_if_not_exists=False)

    # Create or attach the compute cluster
    cluster_name = conf_cluster['cluster_name']
    try:
        compute_target = ComputeTarget(workspace=ws, name=cluster_name)
        print('Found existing compute target.')
    except ComputeTargetException:
        print('Creating a new compute target...')
        compute_config = AmlCompute.provisioning_configuration(
            vm_size=conf_cluster['vm_size'],
            max_nodes=conf_cluster['max_nodes'],
            vm_priority=conf_cluster['vm_priority'],
            idle_seconds_before_scaledown=conf_cluster['idle_seconds_before_scaledown'])

        # Create the cluster
        compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
        compute_target.wait_for_completion(show_output=True)

    # Use get_status() to get a detailed status for the current cluster.
    print(compute_target.get_status().serialize())

    # Set the project directory, assuming this runs from the extract_features_from_videos folder
    project_folder = '../../'

    # Set up custom Docker image usage
    image_registry_details = ContainerRegistry()
    image_registry_details.address = conf_docker['image_registry_address']
    image_registry_details.username = conf_docker['image_registry_username']
    image_registry_details.password = conf_docker['image_registry_password']

    # Don't let the system build a new conda environment
    user_managed_dependencies = True

    # Note that experiment names have to be <36 alphanumeric characters
    exp_name = conf_experiment['experiment_name']
    experiment = Experiment(ws, name=exp_name)

    script_params = {
        '--nas.eval.loader.dataset.dataroot': input_ds.path('/').as_mount(),
        '--nas.search.loader.dataset.dataroot': input_ds.path('/').as_mount(),
        '--common.logdir': output_ds.path('/').as_mount(),
    }

    est = Estimator(source_directory=project_folder,
                    script_params=script_params,
                    compute_target=compute_target,
                    entry_script='scripts/main.py',
                    custom_docker_image=conf_docker['image_name'],
                    image_registry_details=image_registry_details,
                    user_managed=user_managed_dependencies,
                    source_directory_data_store=input_ds)

    run = experiment.submit(est)
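# Hedged usage sketch (not part of the original source): launch_experiment expects a workspace plus
# dict-like config sections. The Config loader and section names mirror the ones used by the
# config-driven __init__ further down; the config file path is a placeholder.
if __name__ == '__main__':
    ws = Workspace.from_config()
    conf = Config('config.yaml')  # hypothetical path
    launch_experiment(ws, conf['aml_config'], conf['cluster_config'],
                      conf['azure_docker'], conf['experiment'])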
def build_pipeline(dataset, ws, config):
    print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))

    base_dir = '.'

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to the AML compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    shutil.copy(os.path.join(base_dir, 'video_decoding.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_submit.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_create.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'train.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'prednet.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'keras_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_preparation.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'model_registration.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'config.json'), script_folder)

    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except ComputeTargetException:
        print("creating new compute target")

        provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                                    max_nodes=4,
                                                                    idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name, provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = config['gpu_compute']
    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except ComputeTargetException:
        print('Creating a new compute target...')

        provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',
                                                                    max_nodes=10,
                                                                    idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name, provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    try:
        print(gpu_compute_target.get_status().serialize())
    except BaseException as e:
        print("Could not get status of compute target.")
        print(e)

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(
        conda_packages=["py-opencv=3.4.2"],
        pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D',
        pip_packages=["azure-storage-blob==1.5.0", "hickle==3.4.3", "requests==2.21.0", "sklearn",
                      "pandas==0.24.2", "azureml-sdk", "numpy==1.16.2", "pillow==6.0.0"])

    # Run configuration for the CPU steps
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    # DataReference to where the video data is stored.
    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))
    print("DataReference object created")

    # Intermediate pipeline data passed between the steps below.
    raw_data = PipelineData("raw_video_frames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames", datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)
    print("PipelineData objects created")

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.'])
    print("video_decode step created")

    # prepare dataset for training/testing the recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py",
        arguments=["--input_data", raw_data, "--output_data", preprocessed_data],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.'])
    data_prep.run_after(video_decoding)
    print("data_prep step created")

    # configure access to ACR for pulling our custom docker image
    acr = ContainerRegistry()
    acr.address = config['acr_address']
    acr.username = config['acr_username']
    acr.password = config['acr_password']

    est = Estimator(source_directory=script_folder,
                    compute_target=gpu_compute_target,
                    entry_script='train.py',
                    use_gpu=True,
                    node_count=1,
                    custom_docker_image="wopauli_1.8-gpu:1",
                    image_registry_details=acr,
                    user_managed=True)

    ps = RandomParameterSampling(
        {
            '--batch_size': choice(1, 2, 4, 8),
            '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
            '--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),  # , "48, 96"),
            '--learning_rate': loguniform(-6, -1),
            '--lr_decay': loguniform(-9, -1),
            '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
            '--transfer_learning': choice("True", "False")
        })

    policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=10)

    hdc = HyperDriveConfig(estimator=est,
                           hyperparameter_sampling=ps,
                           policy=policy,
                           primary_metric_name='val_loss',
                           primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                           max_total_runs=10,
                           max_concurrent_runs=5,
                           max_duration_minutes=60 * 6)

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data,
            '--remote_execution',
            '--dataset', dataset
        ],
        inputs=[preprocessed_data],
        metrics_output=data_metrics,
        allow_reuse=True)
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=cpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.'])
    registration_step.run_after(hd_step)

    pipeline = Pipeline(workspace=ws, steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    schedule = Schedule.create(workspace=ws,
                               name=pipeline_name + "_sch",
                               pipeline_id=published_pipeline.id,
                               experiment_name=pipeline_name,
                               datastore=def_blob_store,
                               wait_for_provisioning=True,
                               description="Datastore scheduler for Pipeline " + pipeline_name,
                               path_on_datastore=os.path.join('prednet/data/video', dataset, 'Train'),
                               polling_interval=1)

    return pipeline_name
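# Hedged follow-up sketch (not part of the original source): besides the datastore-triggered
# Schedule above, the published pipeline can also be run on demand by name, e.g. from a separate
# submission script. `ws` and the `pipeline_name` returned by build_pipeline are assumed.
from azureml.pipeline.core import PublishedPipeline

published = next(p for p in PublishedPipeline.list(ws) if p.name == pipeline_name)
run = published.submit(ws, experiment_name=pipeline_name)
run.wait_for_completion(show_output=True)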
load_dotenv()

# Azure subscription related information
azure_tenant_id = os.environ.get('AZURE_TENANT_ID')
azure_subscription_id = os.environ.get('AZURE_SUBSCRIPTION_ID')
azure_app_id = os.environ.get('AZURE_APP_ID')
azure_app_secret = os.environ.get('AZURE_APP_SECRET')

# Azure Machine Learning service related information
azure_resource_group = 'rg-aml-r-workloads'
aml_workspace_name = 'mlwks-r-workloads'
aml_experiment_name = 'experimenthellor'
aml_compute_target = 'defaultcompute'

# Azure Container Registry related information
acr_details = ContainerRegistry()
acr_details.address = os.environ.get('ACR_ADDRESS')
acr_details.username = os.environ.get('ACR_USERNAME')
acr_details.password = os.environ.get('ACR_PASSWORD')
acr_image = 'aml-r'

# R script related information
r_script = 'hello.r'

# 1. Authenticate with Azure ML service
auth = ServicePrincipalAuthentication(
    tenant_id=azure_tenant_id,
    service_principal_id=azure_app_id,
    service_principal_password=azure_app_secret)

aml_workspace = Workspace.get(name=aml_workspace_name,
                              subscription_id=azure_subscription_id,
                              resource_group=azure_resource_group,
                              auth=auth)
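# Hedged continuation sketch (not part of the original source): an Estimator entry script is run
# with Python, so the R script is typically launched through a thin wrapper that shells out to
# Rscript inside the custom 'aml-r' image. 'wrapper.py' and its contents are hypothetical:
#
#   # wrapper.py
#   import subprocess
#   subprocess.check_call(['Rscript', 'hello.r'])
#
from azureml.core import Experiment
from azureml.core.compute import ComputeTarget
from azureml.train.estimator import Estimator

compute_target = ComputeTarget(workspace=aml_workspace, name=aml_compute_target)
estimator = Estimator(source_directory='.',
                      compute_target=compute_target,
                      entry_script='wrapper.py',            # hypothetical wrapper around r_script
                      custom_docker_image=acr_image,
                      image_registry_details=acr_details,
                      user_managed=True)
run = Experiment(aml_workspace, aml_experiment_name).submit(estimator)
run.wait_for_completion(show_output=True)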
registry_details = None

acr = re.match(r'^((\w+)\.azurecr\.io)/(.*)', args.container)
if acr:
    # Extract the relevant parts from the container image,
    # e.g. onnxtraining.azurecr.io/azureml/bert:latest
    registry_address = acr.group(1)  # onnxtraining.azurecr.io
    registry_name = acr.group(2)     # onnxtraining
    container_image = acr.group(3)   # azureml/bert:latest

    registry_client = get_client_from_cli_profile(
        ContainerRegistryManagementClient, subscription_id=args.subscription)
    registry_credentials = registry_client.registries.list_credentials(
        args.container_registry_resource_group, registry_name)

    registry_details = ContainerRegistry()
    registry_details.address = registry_address
    registry_details.username = registry_credentials.username
    registry_details.password = registry_credentials.passwords[0].value

# MPI configuration if executing a distributed run
mpi = MpiConfiguration()
mpi.process_count_per_node = args.gpu_count

# AzureML Estimator that describes how to run the Experiment
estimator = Estimator(source_directory='./',
                      script_params=script_params,
                      compute_target=compute_target,
                      node_count=args.node_count,
                      distributed_training=mpi,
                      image_registry_details=registry_details,
def __init__(self, config_filepath: str) -> None:
    # read in config
    self.conf = Config(config_filepath)

    # config region
    self.conf_aml = self.conf['aml_config']
    self.conf_storage = self.conf['storage']
    self.conf_cluster = self.conf['cluster_config']
    self.conf_docker = self.conf['azure_docker']
    self.conf_experiment = self.conf['experiment']
    # end region

    # initialize workspace
    self.ws = Workspace.from_config(path=self.conf_aml['aml_config_file'])
    print('Workspace name: ' + self.ws.name,
          'Azure region: ' + self.ws.location,
          'Subscription id: ' + self.ws.subscription_id,
          'Resource group: ' + self.ws.resource_group,
          sep='\n')

    # register blobs
    # TODO: make blob registration more flexible
    self.input_ds = Datastore.register_azure_blob_container(
        workspace=self.ws,
        datastore_name=self.conf_storage['input_datastore_name'],
        container_name=self.conf_storage['input_container_name'],
        account_name=self.conf_storage['input_azure_storage_account_name'],
        account_key=self.conf_storage['input_azure_storage_account_key'],
        create_if_not_exists=False)

    self.output_ds = Datastore.register_azure_blob_container(
        workspace=self.ws,
        datastore_name=self.conf_storage['output_datastore_name'],
        container_name=self.conf_storage['output_container_name'],
        account_name=self.conf_storage['output_azure_storage_account_name'],
        account_key=self.conf_storage['output_azure_storage_account_key'],
        create_if_not_exists=False)

    # get the existing compute cluster
    try:
        self.compute_target = ComputeTarget(
            workspace=self.ws, name=self.conf_cluster['cluster_name'])
        print(self.compute_target.get_status().serialize())
    except Exception as e:
        print('Encountered error trying to get the compute target')
        print(f'Exception was {e}')
        sys.exit(1)

    self.project_folder = self.conf_experiment['project_folder']

    # set up custom docker usage
    self.image_registry_details = ContainerRegistry()
    self.image_registry_details.address = self.conf_docker['image_registry_address']
    self.image_registry_details.username = self.conf_docker['image_registry_username']
    self.image_registry_details.password = self.conf_docker['image_registry_password']

    # don't let the system build a new conda environment; use the custom image as-is
    self.user_managed_dependencies = True
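# Hedged sketch (not part of the original source) of the config shape this __init__ expects,
# with keys inferred from the self.conf[...] lookups above; all values are placeholders.
EXAMPLE_CONFIG = {
    'aml_config': {'aml_config_file': 'aml_config/config.json'},
    'storage': {
        'input_datastore_name': '...', 'input_container_name': '...',
        'input_azure_storage_account_name': '...', 'input_azure_storage_account_key': '...',
        'output_datastore_name': '...', 'output_container_name': '...',
        'output_azure_storage_account_name': '...', 'output_azure_storage_account_key': '...',
    },
    'cluster_config': {'cluster_name': '...'},
    'azure_docker': {
        'image_registry_address': '...', 'image_registry_username': '...',
        'image_registry_password': '...',
    },
    'experiment': {'project_folder': '...'},
}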