Example 1
    def as_input(self, name=None):
        """Specify how to consume the output as an input in subsequent pipeline steps.

        :param name: The name of the input specific to the run.
        :type name: str
        :return: A :class:`azureml.data.dataset_consumption_config.DatasetConsumptionConfig` instance describing
            how to deliver the input data.
        :rtype: azureml.data.dataset_consumption_config.DatasetConsumptionConfig
        """
        from azureml.data.dataset_consumption_config import DatasetConsumptionConfig

        name = name or self.__class__._generate_random_name('input')
        return DatasetConsumptionConfig(name, self, DIRECT_MODE)
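
This docstring comes from an output dataset configuration class. A minimal, hypothetical sketch of chaining two pipeline steps with as_input(), assuming an OutputFileDatasetConfig and placeholder step, script, and compute names:

# Hypothetical sketch (names are placeholders): the first step writes to an
# OutputFileDatasetConfig, the second consumes it through as_input().
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep

prepared = OutputFileDatasetConfig(name='prepared')

prep_step = PythonScriptStep(name='prep-step',
                             script_name='prep.py',
                             source_directory='.',
                             arguments=['--output_path', prepared],
                             compute_target='cpu-cluster')

train_step = PythonScriptStep(name='train-step',
                              script_name='train.py',
                              source_directory='.',
                              arguments=['--input_path', prepared.as_input('prepared_data')],
                              compute_target='cpu-cluster')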
Example 2
    def as_named_input(self, name):
        """Provide a name for this dataset which will be used to retrieve the materialized dataset in the run.

        .. remarks::

            The name here will only be applicable inside an Azure Machine Learning run. The name must only contain
            alphanumeric and underscore characters so it can be made available as an environment variable. You can use
            this name to retrieve the dataset in the context of a run using two approaches:

            * Environment Variable:
                The name will be the environment variable name and the materialized dataset will
                be made available as the value of the environment variable. If the dataset is downloaded or mounted,
                the value will be the downloaded/mounted path. For example:

            .. code-block:: python

                # in your job submission notebook/script:
                dataset.as_named_input('foo').as_download('/tmp/dataset')

                # in the script that will be executed in the run
                import os
                path = os.environ['foo'] # path will be /tmp/dataset

            .. note::
                If the dataset is delivered in direct mode, the value will be the dataset ID. You can then
                retrieve the dataset object by calling `Dataset.get_by_id(os.environ['foo'])`.

            * Run.input_datasets:
                This is a dictionary where the key will be the dataset name you specified in this
                method and the value will be the materialized dataset. For downloaded and mounted datasets, the value
                will be the downloaded/mounted path. For direct mode, the value will be the same dataset object you
                specified in your job submission script.

            .. code-block:: python

                # in your job submission notebook/script:
                dataset.as_named_input('foo') # direct mode

                # in the script that will be executed in the run
                run = Run.get_context()
                run.input_datasets['foo'] # this returns the dataset object from above.


        :param name: The name of the dataset for the run.
        :type name: str
        :return: The configuration object describing how the Dataset should be materialized in the run.
        :rtype: azureml.data.dataset_consumption_config.DatasetConsumptionConfig
        """
        return DatasetConsumptionConfig(name, self)
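
A minimal sketch of the submission side that pairs with the retrieval shown in the docstring; the dataset, script, compute, and experiment names are placeholders:

# Hypothetical end-to-end usage: submit a run with a named, mounted dataset,
# then read it back inside train.py via Run.get_context().input_datasets['training_files'].
from azureml.core import Workspace, Dataset, Experiment, ScriptRunConfig

ws = Workspace.from_config()
ds = Dataset.get_by_name(ws, 'my-dataset')

src = ScriptRunConfig(source_directory='.',
                      script='train.py',
                      arguments=['--data', ds.as_named_input('training_files').as_mount()],
                      compute_target='cpu-cluster')

run = Experiment(ws, 'named-input-demo').submit(src)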
Example 3
import azureml.core
from azureml.core import Workspace, Dataset
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep

print("SDK version:", azureml.core.VERSION)

dataset_name = 'grib-dataset'

ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

datastore = ws.get_default_datastore()

input_ds = Dataset.get_by_name(ws, dataset_name)
batch_data = DatasetConsumptionConfig("batch_dataset", input_ds, mode='mount')

output_dir = PipelineData(name='batch_output', datastore=datastore)

parallel_run_config = ParallelRunConfig.load_yaml(workspace=ws,
                                                  path='convert_parallel.yml')

batch_step = ParallelRunStep(name="batch-conversion-step",
                             parallel_run_config=parallel_run_config,
                             arguments=['--data_output_path', output_dir],
                             inputs=[batch_data],
                             output=output_dir,
                             allow_reuse=False)

steps = [batch_step]
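
A minimal sketch of how the collected steps might be assembled and submitted; the experiment name is a placeholder:

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Build the pipeline from the steps above and submit it as an experiment run.
pipeline = Pipeline(workspace=ws, steps=steps)
run = Experiment(ws, 'batch-conversion').submit(pipeline)
run.wait_for_completion(show_output=True)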
Example 4

ws = Workspace.from_config()
print(
    f'WS name: {ws.name}\nRegion: {ws.location}\nSubscription id: {ws.subscription_id}\nResource group: {ws.resource_group}'
)

print('Loading parallel runconfig for pipeline')
parallel_run_config = ParallelRunConfig.load_yaml(workspace=ws,
                                                  path=args.runconfig)

print('Loading default batch dataset')
batch_dataset = Dataset.get_by_name(ws, args.dataset)

# Parametrize dataset input and dataset output name (batch scoring result) to the pipeline
batch_dataset_parameter = PipelineParameter(name="batch_dataset",
                                            default_value=batch_dataset)
batch_dataset_consumption = DatasetConsumptionConfig(
    "batch_dataset", batch_dataset_parameter).as_mount()

datastore = ws.get_default_datastore()
output_dataset_name = "batch_scoring_results"

# Existing GA code - does not allow specifying the path on the datastore
# output_dataset = PipelineData(name='batch_output', datastore=datastore).as_dataset()
# output_dataset = output_dataset.register(name=output_dataset_name, create_new_version=True)

# New code, not yet GA - allows specifying the path on the datastore
destination_on_datastore = (datastore, 'output_dataset_name/')
output_dataset = OutputFileDatasetConfig(
    name='batch_results',
    destination=destination_on_datastore).register_on_complete(
        name=output_dataset_name)
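
Because the input dataset is wrapped in a PipelineParameter, a different registered dataset can be passed at submission time. A hedged sketch, where the pipeline assembly, dataset name, and experiment name are assumptions:

from azureml.core import Dataset, Experiment
from azureml.pipeline.core import Pipeline

# Assumes 'steps' contains a step wired with inputs=[batch_dataset_consumption]
# and output=output_dataset, as in the snippet above.
pipeline = Pipeline(workspace=ws, steps=steps)

# Override the default dataset through the PipelineParameter name; the dataset
# name 'another-grib-dataset' is a placeholder.
run = Experiment(ws, 'batch-scoring').submit(
    pipeline,
    pipeline_parameters={'batch_dataset': Dataset.get_by_name(ws, 'another-grib-dataset')})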
Example 5
ws = Workspace.from_config()
print(
    f'WS name: {ws.name}\nRegion: {ws.location}\nSubscription id: {ws.subscription_id}\nResource group: {ws.resource_group}'
)

print('Loading runconfig for pipeline')
runconfig = RunConfiguration.load(args.runconfig)
runconfig_register = RunConfiguration.load(args.runconfig_register)

print('Loading dataset')
training_dataset = Dataset.get_by_name(ws, args.dataset)

# Parametrize dataset input to the pipeline
training_dataset_parameter = PipelineParameter(name="training_dataset",
                                               default_value=training_dataset)
training_dataset_consumption = DatasetConsumptionConfig(
    "training_dataset", training_dataset_parameter).as_mount()

train_step = PythonScriptStep(
    name="train-step",
    runconfig=runconfig,
    source_directory=args.source_directory,
    script_name=runconfig.script,
    arguments=['--data_path', training_dataset_consumption],
    inputs=[training_dataset_consumption],
    allow_reuse=False)

register_step = PythonScriptStep(name="register-step",
                                 runconfig=runconfig_register,
                                 source_directory=args.source_directory,
                                 arguments=['--model_name', args.model_name],
                                 script_name=runconfig_register.script)
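
A hedged sketch of how the two steps might be ordered and assembled into a pipeline, given that the register step has no data dependency on the train step:

from azureml.pipeline.core import Pipeline

# Force the register step to run after training completes, then build and validate the pipeline.
register_step.run_after(train_step)
pipeline = Pipeline(workspace=ws, steps=[train_step, register_step])
pipeline.validate()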
Example 6
default_dataset_name = 'german-credit-train-tutorial'

print(f'Azure ML SDK version: {azureml.core.VERSION}')

# Connect to the workspace
ws = Workspace.from_config()
print(f'WS name: {ws.name}')
print(f'Region: {ws.location}')
print(f'Subscription id: {ws.subscription_id}')
print(f'Resource group: {ws.resource_group}')

default_training_dataset = Dataset.get_by_name(ws, default_dataset_name)

# Parametrize dataset input to the pipeline
training_dataset_parameter = PipelineParameter(name='training_dataset', default_value=default_training_dataset)
training_dataset_consumption = DatasetConsumptionConfig('training_dataset', training_dataset_parameter).as_download()

# Load runconfig from earlier exercise and create pipeline
runconfig = RunConfiguration.load(os.path.join(source_directory, 'runconfig.yml'))

train_step = PythonScriptStep(name='train-step',
                        source_directory=source_directory,
                        script_name='train.py',
                        arguments=['--data-path', training_dataset_consumption],
                        inputs=[training_dataset_consumption],
                        runconfig=runconfig,
                        allow_reuse=False)

steps = [train_step]

pipeline = Pipeline(workspace=ws, steps=steps)
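
A minimal sketch of submitting the pipeline and overriding the training_dataset parameter with another registered dataset; the experiment and dataset names are placeholders:

from azureml.core import Dataset, Experiment

# 'german-credit-train-other' stands in for any other registered dataset.
run = Experiment(ws, 'credit-training-pipeline').submit(
    pipeline,
    pipeline_parameters={'training_dataset': Dataset.get_by_name(ws, 'german-credit-train-other')})
run.wait_for_completion(show_output=True)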
Example 7
    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it uses the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True,
                                       min_node_count=None,
                                       timeout_in_minutes=20)

# use get_status() to get a detailed status for the current cluster.
print(compute_target.get_status().serialize())

def_data_store = ws.get_default_datastore()

from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.pipeline.core import PipelineParameter

pipeline_param = PipelineParameter(name="mnist_param", default_value=dataset)
input_mnist_ds_consumption = DatasetConsumptionConfig(
    "mnist_param_config", pipeline_param).as_mount()

from azureml.pipeline.core import Pipeline, PipelineData

output_dir = PipelineData(name="inferences",
                          datastore=def_data_store,
                          output_path_on_compute="mnist/results")

from azureml.core import Workspace, Run, Model, Dataset
from azureml.core.authentication import ServicePrincipalAuthentication
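
A hedged sketch of how the parameterized MNIST input and the PipelineData output might feed a ParallelRunStep and be submitted; parallel_run_config and the experiment name are assumed to be defined elsewhere in the tutorial:

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import ParallelRunStep

# parallel_run_config is assumed to have been created earlier (not shown in this snippet).
batch_score_step = ParallelRunStep(name='batch-score-mnist',
                                   parallel_run_config=parallel_run_config,
                                   inputs=[input_mnist_ds_consumption],
                                   output=output_dir,
                                   allow_reuse=False)

pipeline = Pipeline(workspace=ws, steps=[batch_score_step])
run = Experiment(ws, 'mnist-batch-scoring').submit(pipeline)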