Example 1
    def __init__(self,
                 execution_script,
                 runtime,
                 conda_file=None,
                 docker_file=None,
                 schema_file=None,
                 dependencies=None,
                 enable_gpu=None,
                 tags=None,
                 properties=None,
                 description=None,
                 base_image=None,
                 base_image_registry=None,
                 allow_absolute_path=False,
                 cuda_version=None):
        """Initialize the config object.

        :param execution_script: Path to local file that contains the code to run for the image
        :type execution_script: str
        :param runtime: Which runtime to use for the image. Current supported runtimes are 'spark-py' and 'python'
        :type runtime: str
        :param conda_file: Path to local file containing a conda environment definition to use for the image
        :type conda_file: str
        :param docker_file: Path to local file containing additional Docker steps to run when setting up the image
        :type docker_file: str
        :param schema_file: Path to local file containing a webservice schema to use when the image is deployed
        :type schema_file: str
        :param dependencies: List of paths to additional files/folders that the image needs to run
        :type dependencies: :class:`list[str]`
        :param enable_gpu: Whether or not to enable GPU support in the image. The GPU image must be used on
            Microsoft Azure Services such as Azure Container Instances, Azure Machine Learning Compute,
            Azure Virtual Machines, and Azure Kubernetes Service. Defaults to False.
        :type enable_gpu: bool
        :param tags: Dictionary of key value tags to give this image
        :type tags: dict[str, str]
        :param properties: Dictionary of key value properties to give this image. These properties cannot
            be changed after deployment, however new key value pairs can be added
        :type properties: dict[str, str]
        :param description: A description to give this image
        :type description: str
        :param base_image: A custom image to be used as the base image. If no base image is given, then the
            base image will be chosen based on the given runtime parameter.
        :type base_image: str
        :param base_image_registry: Image registry that contains the base image.
        :type base_image_registry: azureml.core.container_registry.ContainerRegistry
        :param allow_absolute_path: Flag to allow the absolute path
        :type allow_absolute_path: bool
        :param cuda_version: Version of CUDA to install for images that need GPU support. The GPU image must be used on
            Microsoft Azure Services such as Azure Container Instances, Azure Machine Learning Compute,
            Azure Virtual Machines, and Azure Kubernetes Service. Supported versions are 9.0, 9.1, and 10.0.
            If 'enable_gpu' is set, this defaults to '9.1'.
        :type cuda_version: str
        :raises: azureml.exceptions.WebserviceException
        """
        warnings.warn(
            "ContainerImageConfig class has been deprecated and will be removed in a future release. "
            + "Please migrate to using Environments. " +
            "https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-environments",
            category=DeprecationWarning,
            stacklevel=2)

        self.execution_script = execution_script
        self.runtime = runtime
        self.conda_file = conda_file
        self.docker_file = docker_file
        self.schema_file = schema_file
        self.dependencies = dependencies
        self.enable_gpu = enable_gpu
        self.tags = tags
        self.properties = properties
        self.description = description
        self.base_image = base_image
        self.base_image_registry = base_image_registry or ContainerRegistry()
        self.allow_absolute_path = allow_absolute_path
        self.cuda_version = cuda_version

        self.execution_script_path = os.path.abspath(
            os.path.dirname(self.execution_script))
        self.validate_configuration()
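
# Usage sketch (not part of the class above): in practice this configuration is
# typically created through ContainerImage.image_configuration, which builds a
# ContainerImageConfig with the parameters documented above. The file names and
# tag values below are assumptions for illustration only.
from azureml.core.image import ContainerImage

image_config = ContainerImage.image_configuration(
    execution_script="score.py",   # assumed scoring entry point
    runtime="python",              # 'python' or 'spark-py'
    conda_file="myenv.yml",        # assumed conda environment definition
    enable_gpu=False,
    tags={"area": "demo"},
    description="Sample image configuration")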
Example 2
def launch_experiment(ws, conf_aml, conf_cluster, conf_docker,
                      conf_experiment):

    # Register the input data blob container
    input_ds = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name='petridishdata',
        container_name='datasets',
        account_name='petridishdata',
        account_key=conf_aml['azure_storage_account_key'],
        create_if_not_exists=False)

    output_ds = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name='petridishoutput',
        container_name='amloutput',
        account_name='petridishdata',
        account_key=conf_aml['azure_storage_account_key'],
        create_if_not_exists=False)

    # Create or attach compute cluster
    cluster_name = conf_cluster['cluster_name']

    try:
        compute_target = ComputeTarget(workspace=ws, name=cluster_name)
        print('Found existing compute target.')
    except ComputeTargetException:
        print('Creating a new compute target...')
        compute_config = AmlCompute.provisioning_configuration(
            vm_size=conf_cluster['vm_size'],
            max_nodes=conf_cluster['max_nodes'],
            vm_priority=conf_cluster['vm_priority'],
            idle_seconds_before_scaledown=conf_cluster[
                'idle_seconds_before_scaledown'])

        # Create the cluster
        compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
        compute_target.wait_for_completion(show_output=True)

    # use get_status() to get a detailed status for the current cluster.
    print(compute_target.get_status().serialize())

    # Set project directory
    # Assuming running in extract_features_from_videos folder
    project_folder = '../../'

    # Setup custom docker usage
    image_registry_details = ContainerRegistry()
    image_registry_details.address = conf_docker['image_registry_address']
    image_registry_details.username = conf_docker['image_registry_username']
    image_registry_details.password = conf_docker['image_registry_password']

    # don't let the system build a new conda environment
    user_managed_dependencies = True

    # Note that experiment names have to be
    # <36 alphanumeric characters
    exp_name = conf_experiment['experiment_name']

    experiment = Experiment(ws, name=exp_name)
    script_params = {
        '--nas.eval.loader.dataset.dataroot': input_ds.path('/').as_mount(),
        '--nas.search.loader.dataset.dataroot': input_ds.path('/').as_mount(),
        '--common.logdir': output_ds.path('/').as_mount(),
    }

    est = Estimator(source_directory=project_folder,
                    script_params=script_params,
                    compute_target=compute_target,
                    entry_script='scripts/main.py',
                    custom_docker_image=conf_docker['image_name'],
                    image_registry_details=image_registry_details,
                    user_managed=user_managed_dependencies,
                    source_directory_data_store=input_ds)

    run = experiment.submit(est)
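    # Hedged follow-up (not in the original function): the submitted run could be
    # monitored until completion, for example:
    print(run.get_portal_url())
    run.wait_for_completion(show_output=True)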
Example 3
def build_pipeline(dataset, ws, config):
    print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))

    base_dir = '.'
        
    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)
    
    # copy the scripts the pipeline steps need into the script folder
    for script_file in ['video_decoding.py', 'pipelines_submit.py', 'pipelines_create.py',
                        'train.py', 'data_utils.py', 'prednet.py', 'keras_utils.py',
                        'data_preparation.py', 'model_registration.py', 'config.json']:
        shutil.copy(os.path.join(base_dir, script_file), script_folder)
    
    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except ComputeTargetException:
        print("creating new compute target")
        
        provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', 
                                                                    max_nodes=4,
                                                                    idle_seconds_before_scaledown=1800)    
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name, provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
        
    # use get_status() to get a detailed status for the current cluster. 
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = config['gpu_compute']

    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except ComputeTargetException:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', 
                                                                    max_nodes=10,
                                                                    idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name, provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout. 
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster. 
    try:
        print(gpu_compute_target.get_status().serialize())
    except BaseException as e:
        print("Could not get status of compute target.")
        print(e)

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(
        conda_packages=["py-opencv=3.4.2"],
        pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D',
        pip_packages=["azure-storage-blob==1.5.0", "hickle==3.4.3", "requests==2.21.0",
                      "sklearn", "pandas==0.24.2", "azureml-sdk",
                      "numpy==1.16.2", "pillow==6.0.0"])
    
    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    # DataReference to where video data is stored.
    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))
    print("DataReference object created")
        
    # Intermediate PipelineData objects used to pass data between pipeline steps.
    raw_data = PipelineData("raw_video_frames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames", datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)
    print("PipelineData objects created")

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py", 
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target, 
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    print("video_decode step created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py", 
        arguments=["--input_data", raw_data, "--output_data", preprocessed_data],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target, 
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    data_prep.run_after(video_decoding)

    print("data_prep step created")


    # configure access to ACR for pulling our custom docker image
    acr = ContainerRegistry()
    acr.address = config['acr_address']
    acr.username = config['acr_username']
    acr.password = config['acr_password']
    
    est = Estimator(source_directory=script_folder,
                    compute_target=gpu_compute_target,
                    entry_script='train.py', 
                    use_gpu=True,
                    node_count=1,
                    custom_docker_image = "wopauli_1.8-gpu:1",
                    image_registry_details=acr,
                    user_managed=True
                    )

    ps = RandomParameterSampling(
        {
            '--batch_size': choice(1, 2, 4, 8),
            '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
            '--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"), #, "48, 96"),
            '--learning_rate': loguniform(-6, -1),
            '--lr_decay': loguniform(-9, -1),
            '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
            '--transfer_learning': choice("True", "False")
        }
    )

    policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=10)

    hdc = HyperDriveConfig(estimator=est, 
                            hyperparameter_sampling=ps, 
                            policy=policy, 
                            primary_metric_name='val_loss', 
                            primary_metric_goal=PrimaryMetricGoal.MINIMIZE, 
                            max_total_runs=10,
                            max_concurrent_runs=5, 
                            max_duration_minutes=60*6
                            )

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data, 
            '--remote_execution',
            '--dataset', dataset
            ],
        inputs=[preprocessed_data],
        metrics_output = data_metrics,
        allow_reuse=True
    )
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=cpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.']
    )
    registration_step.run_after(hd_step)

    pipeline = Pipeline(workspace=ws, steps=[video_decoding, data_prep, hd_step, registration_step])
    print ("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete") 

    pipeline_name = 'prednet_' + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)
    

    schedule = Schedule.create(workspace=ws, name=pipeline_name + "_sch",
                            pipeline_id=published_pipeline.id, 
                            experiment_name=pipeline_name,
                            datastore=def_blob_store,
                            wait_for_provisioning=True,
                            description="Datastore scheduler for Pipeline" + pipeline_name,
                            path_on_datastore=os.path.join('prednet/data/video', dataset, 'Train'),
                            polling_interval=1
                            )

    return pipeline_name
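
# Usage sketch (not part of build_pipeline): besides the datastore-triggered
# Schedule created above, the published pipeline could also be run on demand by
# looking it up by name; 'ws' and 'pipeline_name' are assumed to be in scope.
from azureml.pipeline.core import PublishedPipeline

for published in PublishedPipeline.list(ws):
    if published.name == pipeline_name:
        pipeline_run = published.submit(ws, experiment_name=pipeline_name)
        break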
Example 4
load_dotenv()

# Azure Subscription related information
azure_tenant_id = os.environ.get('AZURE_TENANT_ID')
azure_subscription_id = os.environ.get('AZURE_SUBSCRIPTION_ID')
azure_app_id = os.environ.get('AZURE_APP_ID')
azure_app_secret = os.environ.get('AZURE_APP_SECRET')

# Azure Machine Learning Service related information
azure_resource_group = 'rg-aml-r-workloads'
aml_workspace_name = 'mlwks-r-workloads'
aml_experiment_name = 'experimenthellor'
aml_compute_target = 'defaultcompute'

# Azure Container Registry related information
acr_details = ContainerRegistry()
acr_details.address = os.environ.get('ACR_ADDRESS')
acr_details.username = os.environ.get('ACR_USERNAME')
acr_details.password = os.environ.get('ACR_PASSWORD')
acr_image = 'aml-r'

# R Script related information
r_script = 'hello.r'

#   1. Authenticate with Azure ML Service
auth = ServicePrincipalAuthentication(
    tenant_id=azure_tenant_id,
    service_principal_id=azure_app_id,
    service_principal_password=azure_app_secret)

aml_workspace = Workspace.get(name=aml_workspace_name,
                              auth=auth,
                              subscription_id=azure_subscription_id,
                              resource_group=azure_resource_group)
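
# Hypothetical continuation (not in the original snippet): the remaining variables
# defined above would typically be used to resolve the compute target and the
# experiment before configuring and submitting a run.
from azureml.core import ComputeTarget, Experiment

compute_target = ComputeTarget(workspace=aml_workspace, name=aml_compute_target)
experiment = Experiment(workspace=aml_workspace, name=aml_experiment_name)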
Example 5
registry_details = None

acr = re.match(r'^((\w+)\.azurecr\.io)/(.*)', args.container)
if acr:
    # Extract the relevant parts from the container image
    #   e.g. onnxtraining.azurecr.io/azureml/bert:latest
    registry_address = acr.group(1)  # onnxtraining.azurecr.io
    registry_name = acr.group(2)  # onnxtraining
    container_image = acr.group(3)  # azureml/bert:latest

    registry_client = get_client_from_cli_profile(
        ContainerRegistryManagementClient, subscription_id=args.subscription)
    registry_credentials = registry_client.registries.list_credentials(
        args.container_registry_resource_group, registry_name)

    registry_details = ContainerRegistry()
    registry_details.address = registry_address
    registry_details.username = registry_credentials.username
    registry_details.password = registry_credentials.passwords[0].value
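
# Standalone illustration (not part of the original script): what the regex above
# extracts for a sample image reference, using the re module already used above.
sample = re.match(r'^((\w+)\.azurecr\.io)/(.*)', 'onnxtraining.azurecr.io/azureml/bert:latest')
assert sample.group(1) == 'onnxtraining.azurecr.io'
assert sample.group(2) == 'onnxtraining'
assert sample.group(3) == 'azureml/bert:latest'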

# MPI configuration if executing a distributed run
mpi = MpiConfiguration()
mpi.process_count_per_node = args.gpu_count

# AzureML Estimator that describes how to run the Experiment
estimator = Estimator(source_directory='./',
                      script_params=script_params,
                      compute_target=compute_target,
                      node_count=args.node_count,
                      distributed_training=mpi,
                      image_registry_details=registry_details,
Example 6
    def __init__(self, config_filepath: str) -> None:

        # read in config
        self.conf = Config(config_filepath)

        # config region
        self.conf_aml = self.conf['aml_config']
        self.conf_storage = self.conf['storage']
        self.conf_cluster = self.conf['cluster_config']
        self.conf_docker = self.conf['azure_docker']
        self.conf_experiment = self.conf['experiment']
        # end region

        # initialize workspace
        self.ws = Workspace.from_config(path=self.conf_aml['aml_config_file'])
        print('Workspace name: ' + self.ws.name,
              'Azure region: ' + self.ws.location,
              'Subscription id: ' + self.ws.subscription_id,
              'Resource group: ' + self.ws.resource_group,
              sep='\n')

        # register blobs
        # TODO: make blob registration more flexible
        self.input_ds = Datastore.register_azure_blob_container(
            workspace=self.ws,
            datastore_name=self.conf_storage['input_datastore_name'],
            container_name=self.conf_storage['input_container_name'],
            account_name=self.conf_storage['input_azure_storage_account_name'],
            account_key=self.conf_storage['input_azure_storage_account_key'],
            create_if_not_exists=False)

        self.output_ds = Datastore.register_azure_blob_container(
            workspace=self.ws,
            datastore_name=self.conf_storage['output_datastore_name'],
            container_name=self.conf_storage['output_container_name'],
            account_name=self.conf_storage[
                'output_azure_storage_account_name'],
            account_key=self.conf_storage['output_azure_storage_account_key'],
            create_if_not_exists=False)

        # create compute cluster
        try:
            self.compute_target = ComputeTarget(
                workspace=self.ws, name=self.conf_cluster['cluster_name'])
            print(self.compute_target.get_status().serialize())
        except Exception as e:
            print('Encountered error trying to get the compute target')
            print(f'Exception was {e}')
            sys.exit(1)

        self.project_folder = self.conf_experiment['project_folder']

        # setup custom docker usage
        self.image_registry_details = ContainerRegistry()
        self.image_registry_details.address = self.conf_docker[
            'image_registry_address']
        self.image_registry_details.username = self.conf_docker[
            'image_registry_username']
        self.image_registry_details.password = self.conf_docker[
            'image_registry_password']

        self.user_managed_dependencies = True
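
    # Hypothetical follow-up method (not part of the original class): a sketch of
    # how the attributes initialized above could drive an Estimator submission,
    # much like Example 2. The entry script name and the 'image_name' /
    # 'experiment_name' config keys are assumptions mirroring that example.
    def launch_experiment(self) -> None:
        from azureml.core import Experiment
        from azureml.train.estimator import Estimator

        est = Estimator(
            source_directory=self.project_folder,
            compute_target=self.compute_target,
            entry_script='scripts/main.py',  # assumed entry point
            custom_docker_image=self.conf_docker['image_name'],
            image_registry_details=self.image_registry_details,
            user_managed=self.user_managed_dependencies,
            source_directory_data_store=self.input_ds)

        experiment = Experiment(self.ws, name=self.conf_experiment['experiment_name'])
        run = experiment.submit(est)
        run.wait_for_completion(show_output=True)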