Example #1
    def __init__(self,
                 workspace,
                 name,
                 datastore_type,
                 container_name,
                 account_name,
                 sas_token=None,
                 account_key=None,
                 protocol=None,
                 endpoint=None):
        """Class AbstractAzureStorageDatastore constructor.

        :param workspace: The workspace this datastore belongs to.
        :type workspace: azureml.core.workspace.Workspace
        :param name: The name of the datastore. It can only contain alphanumeric
            characters or - or _.
        :type name: str
        :param datastore_type: The type of this datastore, either "AzureBlob" or "AzureFile".
        :type datastore_type: str
        :param container_name: The container name.
        :type container_name: str
        :param account_name: The storage account name.
        :type account_name: str
        :param sas_token: The SAS token for accessing this container, defaults to None.
        :type sas_token: str, optional
        :param account_key: The storage account key, defaults to None.
        :type account_key: str, optional
        :param protocol: The protocol to use to connect to the storage account.
            If None, defaults to https.
        :type protocol: str, optional
        :param endpoint: The endpoint of the blob container. If None, defaults to core.windows.net.
        :type endpoint: str, optional
        """
        super(AbstractAzureStorageDatastore,
              self).__init__(workspace, name, datastore_type)
        self.container_name = container_name
        self.account_name = account_name
        self.sas_token = sas_token
        self.account_key = account_key
        self.credential_type = 'None'
        self.protocol = protocol
        self.endpoint = endpoint

        if account_key:
            self.credential_type = 'AccountKey'
        if sas_token:
            self.credential_type = 'Sas'

        self._num_workers = 32

        self._data_reference = DataReference(datastore=self)
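In practice this constructor is not called directly; a concrete Azure storage datastore is normally obtained through the registration factory methods, which then feed it these arguments. A minimal sketch, assuming a local workspace config and placeholder account values:

from azureml.core import Workspace, Datastore

ws = Workspace.from_config()  # assumes a config.json is available locally

# Registering with an account key results in credential_type 'AccountKey';
# passing sas_token=... instead would result in 'Sas'.
blob_datastore = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name="example_blob_store",    # placeholder name
    container_name="example-container",     # placeholder container
    account_name="examplestorageaccount",   # placeholder account
    account_key="<storage-account-key>")    # placeholder secret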
Example #2
    @classmethod
    def get_by_data_reference(cls, workspace, path):
        data_store = Datastore(workspace, cls.DEFAULT_GLOBAL_DATASET_STORE)
        return DataReference(
            datastore=data_store,
            data_reference_name=cls.DEFAULT_DATA_REFERENCE_NAME,
            path_on_datastore=path,
        )
Example #3
    def upload(self,
               src_dir,
               target_path=None,
               overwrite=False,
               show_progress=True):
        """Upload the data from the local file system to blob container this data store points to.

        :param src_dir: The local directory to upload.
        :type src_dir: str
        :param target_path: The location in blob container to upload to. If None, then upload to
            root. Defaults to None.
        :type target_path: str
        :param overwrite: Indicates whether to overwrite existing files. Defaults to False.
        :type overwrite: bool, optional
        :param show_progress: Indicates whether to show progress of the upload in the console.
            Defaults to True.
        :type show_progress: bool, optional
        :return: The DataReference instance for the target path uploaded.
        :rtype: azureml.data.data_reference.DataReference
        """
        module_logger.info("Called AzureBlobDatastore.upload")
        self._ensure_credential("Upload")
        target_path = target_path or ""
        count = self._start_upload_task(
            self._get_upload_from_dir(src_dir, target_path), overwrite,
            lambda target_file_path: self.blob_service.exists(
                self.container_name, target_file_path), show_progress,
            lambda target, source: lambda: self.blob_service.
            create_blob_from_path(self.container_name, target, source))
        module_logger.info(
            "Finished AzureBlobDatastore.upload with count={0}.".format(count))
        return DataReference(datastore=self, path_on_datastore=target_path)
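A minimal usage sketch for the method above, assuming `blob_datastore` is an already-registered AzureBlobDatastore and `./training_data` is a local folder (both placeholders):

# Upload a local folder to the blob container; the returned DataReference
# points at the target path and can be passed to a pipeline step as an input.
ref = blob_datastore.upload(src_dir="./training_data",
                            target_path="training_data",
                            overwrite=True,
                            show_progress=True)
print(ref.path_on_datastore)  # "training_data"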
Example #4
    def upload(self,
               src_dir,
               target_path=None,
               overwrite=False,
               show_progress=True):
        """Upload the data from the local file system to the file share this datastore points to.

        :param src_dir: The local directory to upload.
        :type src_dir: str
        :param target_path: The location in the file share to upload to. If None, then upload to root.
        :type target_path: str
        :param overwrite: Indicates whether to overwrite existing files.
        :type overwrite: bool, optional
        :param show_progress: Indicates whether to show the progress of upload in the console.
        :type show_progress: bool, optional
        :return: The DataReference instance for the target path uploaded.
        :rtype: azureml.data.data_reference.DataReference
        """
        module_logger.info("Called AzureFileDatastore.upload")
        target_path = target_path or ""
        count = self._start_upload_task(
            self._get_upload_from_dir(src_dir, target_path), overwrite,
            lambda target_file_path: self.file_service.exists(
                self.container_name,
                os.path.split(target_file_path)[0],
                os.path.split(target_file_path)[1]), show_progress,
            self._file_share_upload)
        module_logger.info(
            "Finished AzureFileDatastore.upload with count={0}.".format(count))
        return DataReference(datastore=self, path_on_datastore=target_path)
Example #5
    def _get_data_references(self, request_id, internal_datastore):
        print(
            'AMLCompute, _get_data_references() called. Request ID: {}'.format(
                request_id))
        # The datastore name can only contain alphanumeric characters and _.
        request_id_to_use_for_datastore = request_id.replace('-', '_')
        try:
            # setting the overwrite flag to True overwrites any datastore that was created previously with that name

            # internal_datastore stores all user-facing files: list of images, detection results, list of failed images
            # and it so happens that each job also needs the list of images as an input
            internal_datastore_name = 'internal_datastore_{}'.format(
                request_id_to_use_for_datastore)
            internal_account_name = internal_datastore['account_name']
            internal_account_key = internal_datastore['account_key']
            internal_container_name = internal_datastore['container_name']
            internal_datastore = Datastore.register_azure_blob_container(
                self.ws,
                internal_datastore_name,
                internal_container_name,
                internal_account_name,
                account_key=internal_account_key)
            print('internal_datastore done')

            # output_datastore stores the output from score.py in each job, which is another container
            # in the same storage account as internal_datastore
            output_datastore_name = 'output_datastore_{}'.format(
                request_id_to_use_for_datastore)
            output_container_name = api_config.AML_CONTAINER
            output_datastore = Datastore.register_azure_blob_container(
                self.ws,
                output_datastore_name,
                output_container_name,
                internal_account_name,
                account_key=internal_account_key)
            print('output_datastore done')

        except Exception as e:
            raise RuntimeError(
                'Error in connecting to the datastores for AML Compute: {}'.
                format(str(e)))

        try:
            internal_dir = DataReference(datastore=internal_datastore,
                                         data_reference_name='internal_dir',
                                         mode='mount')

            output_dir = PipelineData(
                'output_{}'.format(request_id_to_use_for_datastore),
                datastore=output_datastore,
                output_mode='mount')
            print('Finished setting up the Data References.')
        except Exception as e:
            raise RuntimeError(
                'Error in creating data references for AML Compute: {}.'.
                format(str(e)))

        return internal_dir, output_dir
Example #6
    def _setup_datareference(self, name, path):
        """
        Helper function to set up a DataReference object in AzureML.

        :param str name: [required] name of the data reference
        :param str path: [required] path on the datastore where the data lives.
        :returns: input_data
        :rtype: DataReference
        """
        input_data = DataReference(datastore=self.blob_ds,
                                   data_reference_name=name,
                                   path_on_datastore=path)
        return input_data
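A sketch of how references like the ones produced above are typically consumed, assuming `internal_dir`, `output_dir`, and a compute target `aml_compute` from the surrounding class (the script name and argument names are illustrative):

from azureml.pipeline.steps import PythonScriptStep

# The DataReference is mounted as an input directory and the PipelineData
# becomes an output directory for the step's script.
score_step = PythonScriptStep(
    name="score",
    script_name="score.py",                  # illustrative entry script
    arguments=["--input_dir", internal_dir, "--output_dir", output_dir],
    inputs=[internal_dir],
    outputs=[output_dir],
    compute_target=aml_compute,              # placeholder compute target
    source_directory=".")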
Example #7
    def upload_files(self,
                     files,
                     relative_root=None,
                     target_path=None,
                     overwrite=False,
                     show_progress=True):
        """Upload the data from the local file system to the blob container this datastore points to.

        :param files: A list of absolute paths of files to upload.
        :type files: builtin.list[str]
        :param relative_root: The root used to determine the path
            of the files in the blob. For example, if we upload /path/to/file.txt, and we define
            base path to be /path, when file.txt is uploaded to the blob storage, it will have
            the path of /to/file.txt. If target_path is also given, then it will be used as
            the prefix for the derived path from above. The base path must be a common path of
            all of the files, otherwise an exception will be thrown. Defaults to None, which will find
            the common path.
        :type relative_root: str, optional
        :param target_path: The location in the blob container to upload the data to.
            Defaults to None, the root.
        :type target_path: str, optional
        :param overwrite: Indicates whether to overwrite existing files. Defaults to False.
        :type overwrite: bool, optional
        :param show_progress: Indicates whether to show progress of the upload in the console.
            Defaults to True.
        :type show_progress: bool, optional
        :return: The DataReference instance for the target path uploaded.
        :rtype: azureml.data.data_reference.DataReference
        """
        module_logger.info("Called AzureBlobDatastore.upload_files")
        target_path = target_path or ""
        relative_root = relative_root or common_path(files)
        count = self._start_upload_task(
            self._get_upload_from_files(files, target_path, relative_root,
                                        False), overwrite,
            lambda target_file_path: self.blob_service.exists(
                self.container_name, target_file_path), show_progress,
            lambda target, source: lambda: self.blob_service.
            create_blob_from_path(self.container_name, target, source))
        module_logger.info(
            "Finished AzureBlobDatastore.upload_files with count={0}.".format(count))
        return DataReference(datastore=self, path_on_datastore=target_path)
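A small sketch of the relative_root behavior described in the docstring, with placeholder local paths:

# /data/images/cat.png and /data/images/dog.png share the common path /data/images.
# With relative_root="/data" and target_path="uploads", the blobs land at
# uploads/images/cat.png and uploads/images/dog.png.
ref = blob_datastore.upload_files(
    files=["/data/images/cat.png", "/data/images/dog.png"],  # placeholder files
    relative_root="/data",
    target_path="uploads",
    overwrite=True)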
Example #8
def createDataReference(workspace, storage_name, storage_key,
                        storage_container_name, data_store_name,
                        data_reference_name):
    '''
        If not present, registers a new azureml.core.datastore.Datastore.
        Once the data store is in hand, it creates an instance of azureml.data.data_reference.DataReference
        that can be used in an Azure ML pipeline step.

        PARAMS:
            workspace               : azureml.core.Workspace    : Existing AMLS Workspace
            storage_name            : string                    : Name of the Azure Storage Account
            storage_key             : string                    : Access Key to the Azure Storage Account
            storage_container_name  : string                    : Container name to receive blobs. Must exist.
            data_store_name         : string                    : Name of the registered data store.
            data_reference_name     : string                    : Name of the data reference

        RETURNS:
            tuple(azureml.core.datastore.Datastore, azureml.data.data_reference.DataReference)
    '''
    data_store = None

    try:
        data_store = Datastore.get(workspace, data_store_name)
        print("Found existing data store - ", data_store_name)
    except Exception as ex:
        print("Creating data store - ", data_store_name)

        data_store = Datastore.register_azure_blob_container(
            workspace,
            datastore_name=data_store_name,
            container_name=storage_container_name,
            account_name=storage_name,
            account_key=storage_key,
        )

    if data_store is None:
        raise Exception("Could not create/find data store.")

    return data_store, DataReference(datastore=data_store,
                                     data_reference_name=data_reference_name)
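A usage sketch for the helper above; all argument values are placeholders:

data_store, data_ref = createDataReference(
    workspace=ws,
    storage_name="examplestorageaccount",
    storage_key="<storage-account-key>",
    storage_container_name="pipeline-data",
    data_store_name="pipeline_blob_store",
    data_reference_name="pipeline_input")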
Example #9
def upload_files(files,
                 datastore,
                 relative_root=None,
                 target_path=None,
                 overwrite=False,
                 show_progress=True):
    from azureml.data._upload_helper import _start_upload_task, _get_upload_from_files
    from azureml.data.data_reference import DataReference

    target_path = target_path or ""
    _file_exists(dstore=datastore, path=target_path)
    relative_root = relative_root or common_path(files)
    _start_upload_task(
        _get_upload_from_files(files, target_path, relative_root, True),
        overwrite, lambda target_file_path: _file_exists(
            dstore=datastore, path=target_file_path), show_progress, lambda
        target, source: lambda: _upload_file(base_path=relative_root,
                                             local_file_path=source,
                                             remote_target_path=target_path,
                                             datastore=datastore,
                                             overwrite=overwrite))
    return DataReference(datastore=datastore, path_on_datastore=target_path)
Example #10
def upload_dir(src_dir,
               remote_target_path,
               datastore,
               overwrite=False,
               show_progress=True):
    from azureml.data._upload_helper import _start_upload_task, _get_upload_from_dir
    from azureml.data.data_reference import DataReference

    remote_target_path = remote_target_path or ""
    _file_exists(dstore=datastore, path=remote_target_path)
    _start_upload_task(
        _get_upload_from_dir(src_dir, remote_target_path), overwrite,
        lambda target_file_path: _file_exists(dstore=datastore,
                                              path=target_file_path),
        show_progress, lambda target, source: lambda: _upload_file(
            base_path=src_dir,
            local_file_path=source,
            remote_target_path=remote_target_path,
            datastore=datastore,
            overwrite=overwrite))
    return DataReference(datastore=datastore,
                         path_on_datastore=remote_target_path)
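A usage sketch for the two helpers above, assuming `datastore` is a registered datastore and the local paths exist (placeholders):

# Upload a whole directory and get back a DataReference rooted at the remote path.
dir_ref = upload_dir(src_dir="./raw_data",
                     remote_target_path="raw_data",
                     datastore=datastore,
                     overwrite=True)

# Upload a handful of individual files to the same location.
files_ref = upload_files(files=["./raw_data/a.csv", "./raw_data/b.csv"],
                         datastore=datastore,
                         target_path="raw_data",
                         overwrite=True)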
Example #11
    def my_azure_app(cfg: DictConfig) -> None:
        print(cfg.pretty())
        args_dict = OmegaConf.to_container(cfg, resolve=False)

        yaml_file_nm = args_dict["yaml_file"].split("/")[-1].split(".")[0]
        conf_file = os.path.join(
            args_dict["root_path"],
            yaml_file_nm + "_" + str(datetime.datetime.now()) + ".json",
        )
        print(conf_file)

        with open(conf_file, "w") as out:
            out.write(json.dumps(args_dict))

        # First, list the supported VM families for Azure Machine Learning Compute
        # ws = Workspace.get('experiments')
        cluster_name = "gpucluster"
        experiment_name = args_dict["experiment_name"] + "_azure"
        disable_gpu = args_dict["disable_gpu"]
        script_folder = "."  # todo. this is overriden by hydra
        script_folder = (hydra.utils.get_original_cwd()
                         )  # todo. this is overriden by hydra
        data_path = os.path.join(args_dict["root_path"],
                                 args_dict["data_subdir"])

        sub_id = os.getenv("AZ_SUBS_ID")

        assert sub_id is not None
        # Edit a run configuration property on the fly.
        run_local = RunConfiguration()
        run_local.environment.python.user_managed_dependencies = True

        ws = Workspace.get(
            name="experiments",
            subscription_id=sub_id,
            resource_group="default_resource_group",
        )

        # print(AmlCompute.supported_vmsizes(workspace=ws))

        # Create a new runconfig object
        _ = RunConfiguration()

        # Signal that you want to use AmlCompute to execute the script
        # run_temp_compute.target = "amlcompute"

        # AmlCompute is created in the same region as your workspace
        # Set the VM size for AmlCompute from the list of supported_vmsizes

        try:
            compute_target = ComputeTarget(workspace=ws, name=cluster_name)
            print("Found existing compute target")
        except ComputeTargetException:
            print("Creating a new compute target...")
            compute_config = AmlCompute.provisioning_configuration(
                vm_size=args_dict["vm_size"], max_nodes=1)

            compute_target = ComputeTarget.create(ws, cluster_name,
                                                  compute_config)
            compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=10)

        s = ws.get_default_datastore()

        # A reference to the root_path in azure after uploading
        _ = s.upload(
            src_dir=data_path,
            target_path=data_path,
            overwrite=False,
            show_progress=True,
        )

        # All path except file_name
        # script_target_path = "/".join(args_dict['yaml_file'].split("/")[:-1])
        script_target_path = "/".join(
            conf_file.split("/")[:-1])  # All path except file_name
        print(script_target_path)
        # script_fname = args.config_file.split("/")[-1]
        script_fname = conf_file.split("/")[-1]
        print(script_fname)
        print("---" * 100)

        azure_script_path = s.upload_files(
            files=[conf_file],
            target_path=script_target_path,
            overwrite=True,
            show_progress=True,
        )

        print(azure_script_path)

        azure_script_abs_path = DataReference(datastore=s,
                                              data_reference_name="input_data",
                                              path_on_datastore=conf_file)

        azure_root_path = DataReference(
            datastore=s,
            data_reference_name="root_data",
            path_on_datastore=args_dict["root_path"],
        )

        exp = Experiment(workspace=ws, name=experiment_name)

        # src = ScriptRunConfig(source_directory = script_folder,
        # script = 'run.py', arguments=['--config_file', 'local/pairs.json'],
        # run_config = run_temp_compute)

        # Using pytorch estimator - proper way to submit pytorch jobs
        script_params = {
            "--config_file": azure_script_abs_path,
            "--root_path": azure_root_path,
            "--experiment_name": experiment_name,
        }

        print("GPU Disabled: {}".format(disable_gpu))

        estimator = PyTorch(
            source_directory=script_folder,
            script_params=script_params,
            compute_target=compute_target,
            entry_script="run.py",
            use_gpu=not disable_gpu,
            pip_packages=["pillow==5.4.1"],
        )

        # you can name this as run
        _ = exp.submit(estimator)
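When the DataReference objects above are passed through script_params, the estimator materializes them as mounted paths on the compute target; a sketch of how run.py (the assumed entry script) might read them back, with argument names matching the ones used above:

# run.py (sketch)
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--config_file", type=str)    # resolves to the mounted config json path
parser.add_argument("--root_path", type=str)      # resolves to the mounted root_path directory
parser.add_argument("--experiment_name", type=str)
args = parser.parse_args()

with open(args.config_file) as f:
    print(f.read())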
Example #12
cpu_compute_target.wait_for_completion(show_output=True)

# Create GPU compute target
print('Creating GPU compute target ...')
gpu_cluster_name = 'k80cluster'
gpu_compute_config = AmlCompute.provisioning_configuration(
    vm_size='Standard_NC6',
    idle_seconds_before_scaledown=1200,
    min_nodes=0,
    max_nodes=2)
gpu_compute_target = ComputeTarget.create(workspace, gpu_cluster_name,
                                          gpu_compute_config)
gpu_compute_target.wait_for_completion(show_output=True)

# Get datastore reference
datastore = DataReference(datastore, mode='mount')

# Step 1: Data ingestion
data_ingestion_step, data_ingestion_outputs = data_ingestion_step(
    datastore, cpu_compute_target)

# Step 2: Data preprocessing
data_preprocess_step, data_preprocess_outputs = data_preprocess_step(
    data_ingestion_outputs['raw_data_dir'], cpu_compute_target)

# Step 3: Train Model
train_step, train_outputs = train_step(data_preprocess_outputs['train_dir'],
                                       data_preprocess_outputs['valid_dir'],
                                       gpu_compute_target)

# Step 4: Evaluate Model
Example #13
    def partition_by(self,
                     partition_keys,
                     target,
                     name=None,
                     show_progress=True,
                     partition_as_file_dataset=False):
        """Partitioned data will be copied and output to the destination specified by target.

        Creates a dataset from the output data path with the partition format, registers the dataset
        if a name is provided, and returns the dataset for the new data path with partitions.

        .. code-block:: python

            ds = Dataset.get_by_name('test') # indexed by country, state, partition_date

            # #1: call partition_by locally
            new_ds = ds.partition_by(name="repartitioned_ds", partition_keys=['country'],
                        target=DataPath(datastore, "repartition"))
            partition_keys = new_ds.partition_keys # ['country']

            # new_ds can be passed to PRS as input dataset

        :param partition_keys: Required, partition keys
        :type partition_keys: builtin.list[str]
        :param target: Required, the datastore path where the dataframe parquet data will be uploaded to.
            A guid folder will be generated under the target path to avoid conflict.
        :type target: azureml.data.datapath.DataPath, azureml.core.datastore.Datastore
            or tuple(azureml.core.datastore.Datastore, str) object
        :param name: Optional, the registration name.
        :type name: str
        :param show_progress: Optional, indicates whether to show progress of the upload in the console.
            Defaults to True.
        :type show_progress: bool
        :param partition_as_file_dataset: Optional, indicates whether to return a FileDataset or not.
            Defaults to False.
        :type partition_as_file_dataset: bool
        :return: The saved or registered dataset.
        :rtype: azureml.data.TabularDataset
        """
        from uuid import uuid4
        from azureml.exceptions import UserErrorException
        from azureml.core import Dataset
        from azureml.data.data_reference import DataReference
        from azureml.data._dataset_factory_helper import get_progress_logger, parse_target
        from azureml.dataprep import FieldType
        from azureml.data.dataset_factory import TabularDatasetFactory

        import time
        starting_time = time.process_time()

        console = get_progress_logger(show_progress)
        console("Validating arguments.")
        if len(partition_keys) == 0:
            raise UserErrorException("partition_keys cannot be empty")

        column_types = self._dataflow.dtypes
        invalid_keys = []
        for key in partition_keys:
            if key not in column_types:
                invalid_keys.append(key)
        if len(invalid_keys) != 0:
            raise UserErrorException(
                "{0} are invalid partition keys".format(invalid_keys))

        if len(partition_keys) != len(set(partition_keys)):
            raise UserErrorException("partition_keys cannot have duplicates")
        console("Arguments validated.")

        guid = uuid4()
        datastore, relative_path = parse_target(target)
        relative_path_with_guid = "/%s/%s/" % (relative_path, guid)

        partition_format = relative_path_with_guid
        partition_path = relative_path_with_guid
        saved_dataset_key_column_types = {}

        for key in partition_keys:
            if column_types[key] == FieldType.DATE:
                partition_format = partition_format + '{' + key + ':yyyyMMddHHmmss}*/'
                del column_types[key]
            else:
                partition_format = partition_format + '{' + key + '}/'
            partition_path = partition_path + '*/'
            if key in column_types:
                saved_dataset_key_column_types[key] = column_types[key]

        partition_format = partition_format + '*.parquet'
        partition_path = partition_path + '*.parquet'

        console("Uploading file to {}".format(relative_path_with_guid))

        self._dataflow.write_to_parquet(
            partition_keys=partition_keys,
            directory_path=DataReference(datastore=datastore).path(
                relative_path_with_guid)).run_local()
        console("Successfully uploaded file to datastore.")

        console("Creating a new dataset.")
        if partition_as_file_dataset:
            saved_dataset = Dataset.File.\
                from_files(path=(datastore, partition_path), partition_format=partition_format)
        else:
            saved_dataset = TabularDatasetFactory.\
                from_parquet_files(path=(datastore, partition_path), partition_format=partition_format)
        saved_dataset = TabularDataset._create(
            saved_dataset._dataflow.set_column_types(
                saved_dataset_key_column_types),
            self._properties,
            telemetry_info=self._telemetry_info)

        console("Successfully created a new dataset.")

        if self._registration and self._registration.workspace:
            collect_datasets_usage(
                _get_logger(), _PATITION_BY_ACTIVITY, [self],
                self._registration.workspace, "N/A", {
                    "execution_time": time.process_time() - starting_time,
                    "number_of_partition_keys": len(partition_keys)
                })

        if name is None:
            return saved_dataset
        console("registering a new dataset.")
        registered_dataset = saved_dataset.register(datastore.workspace,
                                                    name,
                                                    create_new_version=True)
        console("Successfully created and registered a new dataset.")
        return registered_dataset
Example #14
service_principal = ServicePrincipalAuthentication(
    tenant_id=tenant_id,
    service_principal_id=application_id,
    service_principal_password=app_secret)

ws = Workspace.get(name=workspace_name,
                   subscription_id=subscription_id,
                   resource_group=resource_group,
                   auth=service_principal)

# Retrieve the pointer to the default Blob storage.
def_blob_store = Datastore(ws, "workspaceblobstore")
print("Blobstore's name: {}".format(def_blob_store.name))

blob_input_data = DataReference(datastore=def_blob_store,
                                data_reference_name="mnist_datainput",
                                path_on_datastore="mnist_datainput")

print("DataReference object created")

# Create a CPU cluster of type D2 V2 with 1 node. (due to subscription's limitations we stick to 1 node)

try:
    compute_target_cpu = ComputeTarget(workspace=ws, name=cluster_name_cpu)
    print('Found existing compute target.')
except ComputeTargetException:
    print('Creating a new compute target...')
    # CPU: Standard_D3_v2
    # GPU: Standard_NV6
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2', max_nodes=1, min_nodes=1)
Example #15
print("Default datastore's name: {}".format(def_file_store.name))
def_blob_store = Datastore(ws, "workspaceblobstore")
print("Blobstore's name: {}".format(def_blob_store.name))


# In[ ]:


# Upload the raw training data to the blob storage
def_blob_store.upload(src_dir=data_location, 
                      target_path='nyc-taxi-raw-features', 
                      overwrite=True, 
                      show_progress=True)

raw_train_data = DataReference(datastore=def_blob_store,
                               data_reference_name="nyc_taxi_raw_features",
                               path_on_datastore="nyc-taxi-raw-features/nyc-taxi-sample-data.csv")
print("DataReference object created")


# ### Create the Process Training Data Pipeline Step

# The intermediate data (or output of a step) is represented by a PipelineData object. A PipelineData object is produced as an output of one step and consumed as an input of one or more later steps, which is how data flows between pipeline steps.
# 
# The process training data pipeline step takes the raw_train_data DataReference object as input, and it will output an intermediate PipelineData object that holds the processed training data with the new engineered features for datetime components: hour of the day, and day of the week.
# 
# Review and run the cell below to construct the PipelineData objects and the PythonScriptStep pipeline step:
# 
# *Open preprocess.py in the local machine and examine the arguments, inputs, and outputs for the script. Note that there is an argument called process_mode to distinguish between processing training data vs test data. Reviewing the Python script file will give you a good sense of why the script argument names used below are important.*

# In[ ]:
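# A sketch of what such a cell might contain, assuming preprocess.py accepts --process_mode, --input and --output arguments (the argument names, compute target, and source directory here are illustrative, not the actual script's interface):

from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Intermediate output of the preprocessing step
processed_train_data = PipelineData("processed_train_data", datastore=def_blob_store)

processTrainDataStep = PythonScriptStep(
    name="process_train_data",
    script_name="preprocess.py",
    arguments=["--process_mode", "train",
               "--input", raw_train_data,
               "--output", processed_train_data],
    inputs=[raw_train_data],
    outputs=[processed_train_data],
    compute_target=aml_compute,   # assumed compute target created in an earlier cell
    source_directory=".")
print("processTrainDataStep created")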
Example #16
        "PUT YOUR STORAGE ACCOUNT KEY HERE")  # Storage account key

    try:
        blob_datastore = Datastore.get(ws, blob_datastore_name)
        print("Found Blob Datastore with name: %s" % blob_datastore_name)
    except Exception:
        blob_datastore = Datastore.register_azure_blob_container(
            workspace=ws,
            datastore_name=blob_datastore_name,
            account_name=account_name,  # Storage account name
            container_name=container_name,  # Name of Azure blob container
            account_key=account_key)  # Storage account key
        print("Registered blob datastore with name: %s" % blob_datastore_name)

    blob_data_ref = DataReference(datastore=blob_datastore,
                                  data_reference_name="blob_test_data",
                                  path_on_datastore="testdata")
    csv_path = (blob_datastore, '/creditcard.csv')

    try:
        tab_ds = Dataset.Tabular.from_delimited_files(path=csv_path)
        tab_ds = tab_ds.register(workspace=ws, name='creditcard')
    except Exception as ex:
        print(ex)
else:
    print('Dataset already registered.')

creditds = ws.datasets['creditcard']
df = creditds.to_pandas_dataframe()

default_ds = ws.get_default_datastore()
                                 path_on_datastore="training_set_labels.csv")
train_labels_path_parameter = PipelineParameter(
    name="train_labels", default_value=train_labels_datapath)
train_labels_path = (train_labels_path_parameter,
                     DataPathComputeBinding(mode="mount"))

test_features_datapath = DataPath(datastore=blobstore,
                                  path_on_datastore="test_set_features.csv")
test_features_path_parameter = PipelineParameter(
    name="test_features", default_value=test_features_datapath)
test_features_path = (test_features_path_parameter,
                      DataPathComputeBinding(mode="mount"))

submission_format_path = DataReference(
    data_reference_name="submission_format",
    datastore=blobstore,
    path_on_datastore="submission_format.csv",
)

submission_path = PipelineData(name="submission", datastore=blobstore)

model_path = PipelineData(name="model", datastore=blobstore)

step = PythonScriptStep(
    script_name="script.py",
    source_directory="script",
    name="flu_shot_learning",
    arguments=[
        train_features_path,
        train_labels_path,
        test_features_path,
Example #18
def build_prednet_pipeline(dataset, ws):
    print("building pipeline for dataset %s in workspace %s" %
          (dataset, ws.name))

    base_dir = "."

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = "./scripts"
    os.makedirs(script_folder, exist_ok=True)

    shutil.copytree(os.path.join(base_dir, "models"),
                    os.path.join(base_dir, script_folder, "models"))
    shutil.copy(os.path.join(base_dir, "train.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "data_preparation.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "register_prednet.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "batch_scoring.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "train_clf.py"), script_folder)
    shutil.copy(os.path.join(base_dir, "register_clf.py"), script_folder)

    cpu_compute_name = args.cpu_compute_name
    cpu_compute_target = AmlCompute(ws, cpu_compute_name)
    print("found existing compute target: %s" % cpu_compute_name)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = args.gpu_compute_name

    gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
    print(gpu_compute_target.get_status().serialize())

    env = Environment.get(ws, "prednet")

    # Runconfigs
    runconfig = RunConfiguration()
    runconfig.environment = env
    print("PipelineData object created")

    # DataReference to where raw data is stored.
    raw_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="raw_data",
        path_on_datastore=os.path.join("prednet", "data", "raw_data"),
    )
    print("DataReference object created")

    # Naming the intermediate data as processed_data and assigning it to the
    # variable processed_data.
    preprocessed_data = PipelineData("preprocessed_data",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    hd_child_cwd = PipelineData("prednet_model_path", datastore=def_blob_store)
    # prednet_path = PipelineData("outputs", datastore=def_blob_store)
    scored_data = PipelineData("scored_data", datastore=def_blob_store)
    model_path = PipelineData("model_path", datastore=def_blob_store)

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name="prepare_data",
        script_name="data_preparation.py",
        arguments=[
            "--raw_data",
            raw_data,
            "--preprocessed_data",
            preprocessed_data,
            "--dataset",
            dataset,
        ],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    # data_prep.run_after(video_decoding)

    print("data_prep step created")

    est = Estimator(
        source_directory=script_folder,
        compute_target=gpu_compute_target,
        entry_script="train.py",
        node_count=1,
        environment_definition=env,
    )

    ps = BayesianParameterSampling({
        "--batch_size":
        choice(1, 2, 4, 10),
        "--filter_sizes":
        choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        "--stack_sizes":
        choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),
        "--learning_rate":
        uniform(1e-6, 1e-3),
        "--lr_decay":
        uniform(1e-9, 1e-2),
        "--freeze_layers":
        choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
        # "--fine_tuning": choice("True", "False"),
    })

    hdc = HyperDriveConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        primary_metric_name="val_loss",
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=3,
        max_concurrent_runs=3,
        max_duration_minutes=60 * 6,
    )

    train_prednet = HyperDriveStep(
        "train_w_hyperdrive",
        hdc,
        estimator_entry_script_arguments=[
            "--preprocessed_data",
            preprocessed_data,
            "--remote_execution",
            "--dataset",
            dataset,
        ],
        inputs=[preprocessed_data],
        outputs=[hd_child_cwd],
        metrics_output=data_metrics,
        allow_reuse=True,
    )
    train_prednet.run_after(data_prep)

    register_prednet = PythonScriptStep(
        name="register_prednet",
        script_name="register_prednet.py",
        arguments=[
            "--data_metrics",
            data_metrics,
        ],
        compute_target=cpu_compute_target,
        inputs=[data_metrics, hd_child_cwd],
        source_directory=script_folder,
        allow_reuse=True,
    )
    register_prednet.run_after(train_prednet)

    batch_scoring = PythonScriptStep(
        name="batch_scoring",
        script_name="batch_scoring.py",
        arguments=[
            "--preprocessed_data",
            preprocessed_data,
            "--scored_data",
            scored_data,
            "--dataset",
            dataset,
            # "--prednet_path",
            # prednet_path
        ],
        compute_target=gpu_compute_target,
        inputs=[preprocessed_data],
        outputs=[scored_data],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    batch_scoring.run_after(register_prednet)

    train_clf = PythonScriptStep(
        name="train_clf",
        script_name="train_clf.py",
        arguments=[
            "--preprocessed_data", preprocessed_data, "--scored_data",
            scored_data, "--model_path", model_path
        ],
        compute_target=cpu_compute_target,
        inputs=[preprocessed_data, scored_data],
        outputs=[model_path],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    train_clf.run_after(batch_scoring)

    register_clf = PythonScriptStep(
        name="register_clf",
        script_name="register_clf.py",
        arguments=["--model_path", model_path],
        inputs=[model_path],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        allow_reuse=True,
        runconfig=runconfig,
    )
    register_clf.run_after(train_clf)

    pipeline = Pipeline(
        workspace=ws,
        steps=[
            data_prep,
            train_prednet,
            register_prednet,
            batch_scoring,
            train_clf,
            register_clf,
        ],
    )
    pipeline.validate()

    pipeline_name = "prednet_" + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    _ = Schedule.create(
        workspace=ws,
        name=pipeline_name + "_sch",
        pipeline_id=published_pipeline.id,
        experiment_name=pipeline_name,
        datastore=def_blob_store,
        wait_for_provisioning=True,
        description="Datastore scheduler for Pipeline" + pipeline_name,
        path_on_datastore=os.path.join("prednet/data/raw_data", dataset,
                                       "Train"),
        polling_interval=60 * 24,
    )

    published_pipeline.submit(ws, pipeline_name)
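The Schedule created above is datastore-driven: it watches path_on_datastore for new or modified blobs at the given polling_interval (in minutes) and triggers the published pipeline when changes appear. A short sketch for inspecting or pausing such schedules, assuming a workspace object `ws`:

from azureml.pipeline.core import Schedule

# List all schedules in the workspace and disable the ones for prednet pipelines.
for sch in Schedule.list(ws):
    if sch.name.startswith("prednet_"):
        print(sch.id, sch.status)
        sch.disable(wait_for_provisioning=True)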
Example #19
# get the workspace
print("Getting a reference to workspace %s" % workspace_name)
ws = Workspace.get(name=workspace_name,
                   subscription_id=subscription_id,
                   resource_group=resource_group)
experiment = Experiment(workspace=ws, name='automl-diabetes')
aml_compute = AmlCompute(ws, compute_target_name)

# read in the data
print("Getting a reference to default datastore")
datastore = ws.get_default_datastore()

print("Preparing the 'prep data' step")
blob_diabetes_data = DataReference(
    datastore=datastore,
    data_reference_name="diabetes_data",
    path_on_datastore="diabetesdata/diabetes_pima.csv")

# Create a new runconfig object
aml_run_config = RunConfiguration()
aml_run_config.target = aml_compute
aml_run_config.environment.docker.enabled = True
aml_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
aml_run_config.environment.python.user_managed_dependencies = False
aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas', 'scikit-learn', 'numpy'],
    pip_packages=[
        'azureml-sdk', 'azureml-dataprep', 'azureml-dataprep[pandas]',
        'azureml-train-automl'
    ],
    pin_sdk_version=False)
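A sketch of how the 'prep data' step mentioned above might wire the DataReference and run configuration together (the script name and output name are illustrative):

from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

prepped_data = PipelineData("prepped_diabetes_data", datastore=datastore)

prep_step = PythonScriptStep(
    name="prep data",
    script_name="prep_data.py",              # illustrative script name
    arguments=["--input", blob_diabetes_data, "--output", prepped_data],
    inputs=[blob_diabetes_data],
    outputs=[prepped_data],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    source_directory=".")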
Example #20
def main():
    e = Env()
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group
    )
    print("get_workspace:")
    print(aml_workspace)

    aml_compute = get_compute(
        aml_workspace,
        e.compute_name,
        e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    environment = get_environment(
        aml_workspace, e.aml_env_name, create_new=e.rebuild_env)
    run_config = RunConfiguration()
    run_config.environment = environment

    if (e.datastore_name):
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name

    run_config.environment.environment_variables["DATASTORE_NAME"] \
        = datastore_name

    dataset_name = e.dataset_name
    file_name = e.file_name
    datastore = Datastore.get(aml_workspace, datastore_name)

    if (dataset_name not in aml_workspace.datasets):
        raise Exception("Could not find dataset at \"%s\"." % dataset_name)
    else:
        dataset = Dataset.get_by_name(aml_workspace, name=dataset_name)
        dataset.download(target_path='.', overwrite=True)
        datastore.upload_files([file_name],
                               target_path=dataset_name,
                               overwrite=True)

    raw_data_file = DataReference(datastore=datastore,
                                  data_reference_name="Raw_Data_File",
                                  path_on_datastore=dataset_name + '/'
                                  + file_name)

    clean_data_file = PipelineParameter(name="clean_data_file",
                                        default_value="/clean_data.csv")
    clean_data_folder = PipelineData("clean_data_folder",
                                     datastore=datastore)

    prepDataStep = PythonScriptStep(name="Prepare Data",
                                    source_directory=e.sources_directory_train,
                                    script_name=e.data_prep_script_path,
                                    arguments=["--raw_data_file",
                                               raw_data_file,
                                               "--clean_data_folder",
                                               clean_data_folder,
                                               "--clean_data_file",
                                               clean_data_file],
                                    inputs=[raw_data_file],
                                    outputs=[clean_data_folder],
                                    compute_target=aml_compute,
                                    allow_reuse=False)

    print("Step Prepare Data created")

    new_model_file = PipelineParameter(name="new_model_file ",
                                       default_value='/' + e.model_name
                                       + '.pkl')
    new_model_folder = PipelineData("new_model_folder", datastore=datastore)
    est = SKLearn(source_directory=e.sources_directory_train,
                  entry_script=e.train_script_path,
                  pip_packages=['azureml-sdk', 'scikit-learn==0.20.3',
                                'azureml-dataprep[pandas,fuse]>=1.1.14'],
                  compute_target=aml_compute)

    trainingStep = EstimatorStep(
        name="Model Training",
        estimator=est,
        estimator_entry_script_arguments=["--clean_data_folder",
                                          clean_data_folder,
                                          "--new_model_folder",
                                          new_model_folder,
                                          "--clean_data_file",
                                          clean_data_file.default_value,
                                          "--new_model_file",
                                          new_model_file.default_value],
        runconfig_pipeline_params=None,
        inputs=[clean_data_folder],
        outputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Train created")

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)

    evaluateStep = PythonScriptStep(
        name="Evaluate Model",
        source_directory=e.sources_directory_train,
        script_name=e.evaluate_script_path,
        arguments=["--model_name", model_name_param],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Evaluate created")

    registerStep = PythonScriptStep(
        name="Register Model",
        source_directory=e.sources_directory_train,
        script_name=e.register_script_path,
        arguments=["--new_model_folder", new_model_folder,
                   "--new_model_file", new_model_file,
                   "--model_name", model_name_param],
        inputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Register created")

    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        trainingStep.run_after(prepDataStep)
        evaluateStep.run_after(trainingStep)
        registerStep.run_after(evaluateStep)
    else:
        print("Exclude evaluation step and directly run register step.")
        trainingStep.run_after(prepDataStep)
        registerStep.run_after(trainingStep)

    pipeline = Pipeline(workspace=aml_workspace, steps=[registerStep])
    pipeline.validate()
    print("Pipeline is built")

    pipeline._set_experiment_name
    published_pipeline = pipeline.publish(
        name=e.pipeline_name,
        description="Predict Employee Retention Model training pipeline",
        version=e.build_id
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Example #21
def build_pipeline(dataset, ws, config):
    print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))

    base_dir = '.'
        
    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)
    
    shutil.copy(os.path.join(base_dir, 'video_decoding.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_submit.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_create.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'train.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'prednet.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'keras_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_preparation.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'model_registration.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'config.json'), script_folder)
    
    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except Exception:  # ComputeTargetException
        print("creating new compute target")
        
        provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', 
                                                                    max_nodes=4,
                                                                    idle_seconds_before_scaledown=1800)    
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name, provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
        
    # use get_status() to get a detailed status for the current cluster. 
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = config['gpu_compute']

    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except Exception:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', 
                                                                    max_nodes=10,
                                                                    idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name, provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout. 
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster. 
    try:
        print(gpu_compute_target.get_status().serialize())
    except BaseException as e:
        print("Could not get status of compute target.")
        print(e)

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(
        conda_packages=["py-opencv=3.4.2"],
        pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D',
        pip_packages=["azure-storage-blob==1.5.0", "hickle==3.4.3",
                      "requests==2.21.0", "sklearn", "pandas==0.24.2",
                      "azureml-sdk", "numpy==1.16.2", "pillow==6.0.0"])
    
    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    # DataReference to where video data is stored.
    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))
    print("DataReference object created")
        
    # Naming the intermediate data as processed_data1 and assigning it to the variable processed_data1.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames", datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py", 
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target, 
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    print("video_decode step created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py", 
        arguments=["--input_data", raw_data, "--output_data", preprocessed_data],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target, 
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    data_prep.run_after(video_decoding)

    print("data_prep step created")


    # configure access to ACR for pulling our custom docker image
    acr = ContainerRegistry()
    acr.address = config['acr_address']
    acr.username = config['acr_username']
    acr.password = config['acr_password']
    
    est = Estimator(source_directory=script_folder,
                    compute_target=gpu_compute_target,
                    entry_script='train.py', 
                    use_gpu=True,
                    node_count=1,
                    custom_docker_image = "wopauli_1.8-gpu:1",
                    image_registry_details=acr,
                    user_managed=True
                    )

    ps = RandomParameterSampling(
        {
            '--batch_size': choice(1, 2, 4, 8),
            '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
            '--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"), #, "48, 96"),
            '--learning_rate': loguniform(-6, -1),
            '--lr_decay': loguniform(-9, -1),
            '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
            '--transfer_learning': choice("True", "False")
        }
    )

    policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=10)

    hdc = HyperDriveConfig(estimator=est, 
                            hyperparameter_sampling=ps, 
                            policy=policy, 
                            primary_metric_name='val_loss', 
                            primary_metric_goal=PrimaryMetricGoal.MINIMIZE, 
                            max_total_runs=10,
                            max_concurrent_runs=5, 
                            max_duration_minutes=60*6
                            )

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data, 
            '--remote_execution',
            '--dataset', dataset
            ],
        inputs=[preprocessed_data],
        metrics_output = data_metrics,
        allow_reuse=True
    )
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=cpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.']
    )
    registration_step.run_after(hd_step)

    pipeline = Pipeline(workspace=ws, steps=[video_decoding, data_prep, hd_step, registration_step])
    print ("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete") 

    pipeline_name = 'prednet_' + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)
    

    schedule = Schedule.create(workspace=ws, name=pipeline_name + "_sch",
                            pipeline_id=published_pipeline.id, 
                            experiment_name=pipeline_name,
                            datastore=def_blob_store,
                            wait_for_provisioning=True,
                            description="Datastore scheduler for Pipeline" + pipeline_name,
                            path_on_datastore=os.path.join('prednet/data/video', dataset, 'Train'),
                            polling_interval=1
                            )

    return pipeline_name
Example #22
def build_pipeline(dataset, ws, config):
    print("building pipeline for dataset %s in workspace %s" %
          (dataset, ws.name))

    hostname = socket.gethostname()
    if hostname == 'wopauliNC6':
        base_dir = '.'
    else:
        base_dir = '.'

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    shutil.copy(os.path.join(base_dir, 'video_decoding.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_submit.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_build.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'train.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'prednet.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'keras_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_preparation.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'model_registration.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'config.json'), script_folder)

    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except ComputeTargetException:
        print("creating new compute target")

        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_D2_V2',
            max_nodes=4,
            idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name,
                                                  provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = config['gpu_compute']

    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except ComputeTargetException:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6',
            max_nodes=5,
            idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name,
                                                  provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(gpu_compute_target.get_status().serialize())

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(conda_packages=["py-opencv=3.4.2"],
                                      pip_packages=[
                                          "azure-storage-blob==1.5.0",
                                          "hickle==3.4.3", "requests==2.21.0",
                                          "sklearn", "pandas==0.24.2",
                                          "azureml-sdk==1.0.21",
                                          "numpy==1.16.2", "pillow==6.0.0"
                                      ])
    gpu_cd = CondaDependencies.create(pip_packages=[
        "keras==2.0.8", "theano==1.0.4", "tensorflow==1.8.0",
        "tensorflow-gpu==1.8.0", "hickle==3.4.3", "matplotlib==3.0.3",
        "seaborn==0.9.0", "requests==2.21.0", "bs4==0.0.1", "imageio==2.5.0",
        "sklearn", "pandas==0.24.2", "azureml-sdk==1.0.21", "numpy==1.16.2"
    ])

    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    gpu_compute_run_config = RunConfiguration(conda_dependencies=gpu_cd)
    gpu_compute_run_config.environment.docker.enabled = True
    gpu_compute_run_config.environment.docker.gpu_support = True
    gpu_compute_run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE
    gpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    video_data = DataReference(datastore=def_blob_store,
                               data_reference_name="video_data",
                               path_on_datastore=os.path.join(
                                   "prednet", "data", "video", dataset))

    # Naming the intermediate data as processed_data1 and assigning it to the variable processed_data1.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    print("DataReference object created")

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.'])
    print("video_decode created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(name='prepare_data',
                                 script_name="data_preparation.py",
                                 arguments=[
                                     "--input_data", raw_data, "--output_data",
                                     preprocessed_data
                                 ],
                                 inputs=[raw_data],
                                 outputs=[preprocessed_data],
                                 compute_target=cpu_compute_target,
                                 source_directory=script_folder,
                                 runconfig=cpu_compute_run_config,
                                 allow_reuse=True,
                                 hash_paths=['.'])
    data_prep.run_after(video_decoding)

    print("data_prep created")

    est = TensorFlow(source_directory=script_folder,
                     compute_target=gpu_compute_target,
                     pip_packages=[
                         'keras==2.0.8', 'theano', 'tensorflow==1.8.0',
                         'tensorflow-gpu==1.8.0', 'matplotlib', 'horovod',
                         'hickle'
                     ],
                     entry_script='train.py',
                     use_gpu=True,
                     node_count=1)

    ps = RandomParameterSampling({
        '--batch_size':
        choice(2, 4, 8, 16),
        '--filter_sizes':
        choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        '--stack_sizes':
        choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),  #, "48, 96"),
        '--learning_rate':
        loguniform(-6, -1),
        '--lr_decay':
        loguniform(-9, -1),
        '--freeze_layers':
        choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "1", "2",
               "3"),
        '--transfer_learning':
        choice("True", "False")
    })

    policy = BanditPolicy(evaluation_interval=2,
                          slack_factor=0.1,
                          delay_evaluation=20)

    hdc = HyperDriveRunConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        policy=policy,
        primary_metric_name='val_loss',
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=5,  #100,
        max_concurrent_runs=5,  #10,
        max_duration_minutes=60 * 6)

    hd_step = HyperDriveStep(name="train_w_hyperdrive",
                             hyperdrive_run_config=hdc,
                             estimator_entry_script_arguments=[
                                 '--data-folder', preprocessed_data,
                                 '--remote_execution'
                             ],
                             inputs=[preprocessed_data],
                             metrics_output=data_metrics,
                             allow_reuse=True)
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=gpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.'])
    registration_step.run_after(hd_step)

    pipeline = Pipeline(
        workspace=ws,
        steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    pipeline.publish(name=pipeline_name)

    return pipeline_name
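
# Unlike the previous variant, this build_pipeline publishes the pipeline without
# attaching a Schedule. A minimal sketch of adding the same kind of datastore-
# triggered schedule afterwards, assuming the `ws`, `def_blob_store`, and `dataset`
# names used above:
import os

from azureml.pipeline.core import PublishedPipeline, Schedule

pipeline_name = 'prednet_' + dataset
published = next(p for p in PublishedPipeline.list(ws) if p.name == pipeline_name)

# Re-run the pipeline whenever new training videos land on the datastore path.
Schedule.create(workspace=ws,
                name=pipeline_name + "_sch",
                pipeline_id=published.id,
                experiment_name=pipeline_name,
                datastore=def_blob_store,
                path_on_datastore=os.path.join('prednet/data/video', dataset, 'Train'),
                polling_interval=1,
                description="Datastore scheduler for Pipeline " + pipeline_name)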
예제 #23
0
# use get_status() to get a detailed status for the current cluster.
try:
    print(gpu_compute_target.get_status().serialize())
except BaseException as e:
    print("Could not get status of compute target.")
    print(e)

print("PipelineData object created")

# This is where data is expected to be found in Azure Blob storage
path_on_datastore = os.path.join("knowledge_distillation", "data")

# DataReference to where is the input dataset stored
labeled_data = DataReference(datastore=def_blob_store,
                             data_reference_name="labeled_data",
                             path_on_datastore=path_on_datastore)
print("DataReference object created")

# Conda dependencies for compute targets
gpu_cd = CondaDependencies.create(conda_packages=['cudatoolkit=10.0.130'],
                                  pip_packages=[
                                      'keras', 'tensorflow', 'tensorflow-gpu',
                                      'matplotlib', 'pillow', 'six', 'numpy',
                                      'azureml-sdk', 'tqdm'
                                  ])

# Runconfig
gpu_compute_run_config = RunConfiguration(conda_dependencies=gpu_cd)
gpu_compute_run_config.environment.docker.enabled = True
gpu_compute_run_config.environment.docker.gpu_support = True
old_datastore = [ds for ds in ws.datastores if ds == "telemetry"]
if old_datastore:
    old_ds = Datastore.get(ws, "telemetry")
    old_ds.unregister()

telemetry_ds = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name='telemetry',
    container_name=args.storage_container,
    account_name=args.storage_account,
    account_key=args.storage_key,
)

input_data = DataReference(
    datastore=telemetry_ds,
    data_reference_name="input_data",
    path_on_datastore=args.storage_path,
)

preprocessing_est = SKLearn(
    source_directory='010-preprocessing',
    compute_target=cpu_cluster,
    entry_script='dataprep.py',
    conda_packages=['pandas'],
    pip_packages=['fastavro'],
)

output = PipelineData("output", datastore=telemetry_ds)
preprocessing_step = EstimatorStep(
    name="Preprocessing_Train",
    estimator=preprocessing_est,
예제 #25
0
def main():
    """
    Builds the Azure ML pipeline for data engineering and model training.
    """
    databricks_workspace_name = os.environ['DATABRICKS_WORKSPACE_NAME']
    training_data_account_name = os.environ['TRAINING_DATA_ACCOUNT_NAME']
    # default must be a string: build_id is concatenated into the token comment below
    build_id = os.getenv('BUILD_BUILDID', '0')

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=os.environ['AML_WORKSPACE_NAME'],
        subscription_id=os.environ['SUBSCRIPTION_ID'],
        resource_group=os.environ['RESOURCE_GROUP'],
    )
    print(aml_workspace)

    # Generate Databricks credentials, see https://aka.ms/databricks-aad
    dbricks_region = aml_workspace.location
    dbricks_api = f"https://{dbricks_region}.azuredatabricks.net/api/2.0"

    dbricks_client = databricks_client.create(dbricks_api)
    dbricks_client.auth_azuread(resource_group=aml_workspace.resource_group,
                                workspace_name=databricks_workspace_name)
    dbricks_client.ensure_available()

    # Attach Databricks as Azure ML training compute
    dbricks_compute_name = "databricks"
    dbricks_compute = get_databricks_compute(
        aml_workspace,
        dbricks_compute_name,
    )
    if dbricks_compute is None:
        pat_token = dbricks_client.post(
            'token/create',
            json={"comment": "Azure ML Token generated by Build " + build_id
                  })['token_value']
        dbricks_compute = create_databricks_compute(
            aml_workspace,
            databricks_workspace_name,
            dbricks_compute_name,
            pat_token,
        )

    print("dbricks_compute:")
    print(dbricks_compute)

    # Create Databricks instance pool
    pool_name = "azureml_training"
    instance_pool_id = get_instance_pool(dbricks_client, pool_name)
    if not instance_pool_id:
        dbricks_client.post('instance-pools/create',
                            json={
                                "instance_pool_name":
                                pool_name,
                                "node_type_id":
                                "Standard_D3_v2",
                                "idle_instance_autotermination_minutes":
                                10,
                                "preloaded_spark_versions":
                                [DATABRICKS_RUNTIME_VERSION],
                            })
        instance_pool_id = get_instance_pool(dbricks_client, pool_name)

    notebook_folder = f"/Shared/AzureMLDeployed"
    workspace_datastore = Datastore(aml_workspace, "workspaceblobstore")

    # Create a datastore for the training data container
    credentials, subscription = get_azure_cli_credentials()
    storage_client = StorageManagementClient(credentials, subscription)
    training_storage_keys = storage_client.storage_accounts.list_keys(
        aml_workspace.resource_group, training_data_account_name)
    training_datastore = Datastore.register_azure_blob_container(
        workspace=aml_workspace,
        datastore_name="trainingdata",
        container_name="trainingdata",
        account_name=training_data_account_name,
        account_key=training_storage_keys.keys[0].value,
    )

    # FEATURE ENGINEERING STEP (DATABRICKS)
    # Create feature engineering pipeline step

    training_data_input = DataReference(datastore=training_datastore,
                                        path_on_datastore="/",
                                        data_reference_name="training")

    feature_eng_output = PipelineData("feature_engineered",
                                      datastore=workspace_datastore)

    notebook_path = upload_notebook(dbricks_client, notebook_folder,
                                    "code/prepare", "feature_engineering")

    training_dataprep_step = DatabricksStep(
        name="FeatureEngineering",
        inputs=[training_data_input],
        outputs=[feature_eng_output],
        spark_version=DATABRICKS_RUNTIME_VERSION,
        instance_pool_id=instance_pool_id,
        num_workers=3,
        notebook_path=notebook_path,
        run_name="FeatureEngineering",
        compute_target=dbricks_compute,
        allow_reuse=True,
    )

    # You can add Azure ML model training tasks using
    #   feature_eng_output as input.
    # ...
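
    # Illustrative sketch only (not from the original): a follow-on training step
    # that consumes feature_eng_output. The script name, source directory, and
    # compute target are hypothetical placeholders, and PythonScriptStep would need
    # to be imported from azureml.pipeline.steps.
    # train_step = PythonScriptStep(
    #     name="TrainModel",
    #     script_name="train.py",
    #     arguments=["--input", feature_eng_output],
    #     inputs=[feature_eng_output],
    #     compute_target=aml_compute,
    #     source_directory="code/train",
    # )
    # steps would then become [training_dataprep_step, train_step]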

    # Create Azure ML Pipeline
    steps = [training_dataprep_step]

    ml_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    ml_pipeline.validate()
    published_pipeline = ml_pipeline.publish(
        name="Feature Engineering",
        description="Feature engineering pipeline",
        version=build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")

    # When running in Azure DevOps, set AMLPIPELINE_ID variable
    # for AML Pipeline task in next job
    print("Setting Azure DevOps variable")
    print(f"##vso[task.setvariable variable=AMLPIPELINE_ID;isOutput=true]"
          f"{published_pipeline.id}")
예제 #26
0
parser = argparse.ArgumentParser()
parser.add_argument("--await_completion", type=bool, default=False)
parser.add_argument("--download_outputs", type=bool, default=False)
args = parser.parse_args()

workspace = Workspace.from_config(auth=AzureCliAuthentication())

# Retrieve datastore/datasets
# retrieve datastore
datastore_name = 'workspaceblobstore'
datastore = Datastore.get(workspace, datastore_name)

# data reference
baseline_profile = DataReference(datastore,
                                 data_reference_name='baselineProfile',
                                 path_on_datastore='baseline_profile',
                                 mode='download',
                                 path_on_compute=None,
                                 overwrite=False)

# data reference
historic_profile = DataReference(datastore,
                                 data_reference_name='historicProfile',
                                 path_on_datastore='historic_profile',
                                 mode='download',
                                 path_on_compute=None,
                                 overwrite=False)

# define data set names
input_name_train_sub = 'newsgroups_raw_subset_train'
input_name_test_sub = 'newsgroups_raw_subset_test'
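
# The snippet stops after defining the dataset names. A minimal sketch (assuming the
# subsets were previously registered in the workspace under exactly these names) of
# resolving them to Dataset objects for use as pipeline inputs:
from azureml.core import Dataset

dataset_train_sub = Dataset.get_by_name(workspace, name=input_name_train_sub)
dataset_test_sub = Dataset.get_by_name(workspace, name=input_name_test_sub)

# Named inputs can then be handed to a pipeline or estimator step.
train_input = dataset_train_sub.as_named_input('train_subset')
test_input = dataset_test_sub.as_named_input('test_subset')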
예제 #27
0
        "azureml-mlflow==1.5.0",
        "azureml-defaults==1.5.0"
    ]
)
env.python.conda_dependencies = cd
env.register(workspace=ws)
print("Registered environment component-condition")

# Specify the run configuration
run_config = RunConfiguration()
run_config.environment.docker.enabled = True
run_config.environment.python.conda_dependencies = cd

# Pipeline definition
inputdata = DataReference(
    datastore=Datastore.get(ws, "trainingdata"),
    data_reference_name="data"
)

train_model = PythonScriptStep(
    script_name="./train.py",
    name="fit-nlp-model",
    inputs=[inputdata.as_download(path_on_compute="./data")],
    runconfig=run_config,
    compute_target=compute_target,
)

pipeline = Pipeline(
    workspace=ws,
    steps=[train_model],
    description="Builds Keras model for detecting component defects",
)
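
# The snippet ends with the Pipeline object. A minimal sketch of validating it and
# kicking off a run; the experiment name is an illustrative assumption.
from azureml.core import Experiment

pipeline.validate()
run = Experiment(ws, "component-defects").submit(pipeline)
run.wait_for_completion(show_output=True)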
예제 #28
0
    # step 1: set up compute, data references, and the run configuration
    cluster_name = "cpucluster"

    try:
        compute_target_cpu = ComputeTarget(workspace=ws, name=cluster_name)
    except ComputeTargetException:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_D3_V2', max_nodes=1, min_nodes=1)
        compute_target_cpu = ComputeTarget.create(ws, cluster_name,
                                                  compute_config)
        compute_target_cpu.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=0)

    input_data_ref = DataReference(
        datastore=def_blob_store,
        data_reference_name="input_data_ref",
        path_on_datastore=f"{project_config['project_name']}/data/")

    processed_data_ref = PipelineData("processed_data_ref",
                                      datastore=def_blob_store)

    run_config = RunConfiguration()
    run_config.environment.docker.enabled = True
    run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    run_config.environment.python.user_managed_dependencies = False
    pip_packages = [
        "azureml-sdk==1.0.17", "scikit-learn==0.21.3", "download==0.3.4",
        "pandas==0.25.1", "spacy==2.1.4", "numpy==1.17.2"
    ]

    run_config.environment.python.conda_dependencies = CondaDependencies.create(
예제 #29
0
# ## Pipeline definition
#
#
# The Azure ML pipeline is composed of two steps:
#
#  - Data pre-processing, which consists of one-hot encoding the categorical features, normalizing the feature set, splitting the dataset into training/testing sets, and finally writing the output to storage.
#
#  - A HyperDrive step that tunes and trains the deep kernel learning model using GPyTorch and the PyTorch estimator.
#%% [markdown]
# ## Pipeline data input/output
#
# Here, we define the input and intermediate datasets that will be used by the pipeline steps.

#%%
input_dir = DataReference(datastore=default_store,
                          data_reference_name="input_data",
                          path_on_datastore="churn")

processed_dir = PipelineData(name='processed_data', datastore=default_store)

#%% [markdown]
# ## Pipeline 1st step: Data Preprocessing
#
# We start by defining the run configuration with the dependencies needed by the preprocessing step.
#
# In the cell that follows, we compose the first step of the pipeline.
#

#%%
cd = CondaDependencies()
cd.add_conda_package('pandas')
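
# The cell above only declares the conda dependency; the snippet cuts off before the
# step itself is composed. A minimal sketch of that first pre-processing step, using
# the input_dir and processed_dir defined earlier (the script name, source directory,
# and compute target name are hypothetical placeholders):
from azureml.core.runconfig import RunConfiguration
from azureml.pipeline.steps import PythonScriptStep

preprocess_run_config = RunConfiguration(conda_dependencies=cd)

preprocess_step = PythonScriptStep(
    name='preprocess_data',
    script_name='preprocess.py',
    arguments=['--input_dir', input_dir, '--output_dir', processed_dir],
    inputs=[input_dir],
    outputs=[processed_dir],
    compute_target='cpu-cluster',
    runconfig=preprocess_run_config,
    source_directory='.',
)
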
# Configure Machine Learning resources
# help(Workspace)
ws = Workspace(subscription_id=sub_id,
               resource_group=res_grp,
               workspace_name=workspace_n)

# Configure a datastore
# Default datastore
def_data_store = ws.get_default_datastore()

# Configure the data reference
# Create a data source that can be referenced in a pipeline as a step input.
# In a pipeline, a data source is represented by a DataReference object.
from azureml.data.data_reference import DataReference
blob_input_data = DataReference(datastore=def_data_store,
                                data_reference_name=data_ref,
                                path_on_datastore=data_filepath)

# Intermediate data (or the output of a step) is represented by a PipelineData object.
from azureml.pipeline.core import PipelineData
output_data1 = PipelineData("output_data1",
                            datastore=def_data_store,
                            output_name=model_pklname)

# Configure the compute target
# Create Azure Machine Learning compute capacity to run your steps
from azureml.core.compute import ComputeTarget, AmlCompute
compute_name = "computeuh"
vm_size = 'STANDARD_D2_V2'
if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]