Example #1
File: pipeline.py Project: naivelogic/seer
def process_step(datastore: Datastore, compute: ComputeTarget,
                 path_on_datastore: str) -> (PipelineData, EstimatorStep):
    datapath = DataPath(datastore=datastore,
                        path_on_datastore=path_on_datastore)
    data_path_pipeline_param = (PipelineParameter(name="data",
                                                  default_value=datapath),
                                DataPathComputeBinding(mode='mount'))

    seer_tfrecords = PipelineData("tfrecords_set",
                                  datastore=datastore,
                                  is_directory=True)

    prep = Estimator(source_directory='.',
                     compute_target=compute,
                     entry_script='prep.py',
                     pip_requirements_file='requirements.txt')

    prepStep = EstimatorStep(name='Data Preparation',
                             estimator=prep,
                             estimator_entry_script_arguments=[
                                 "--source_path", data_path_pipeline_param,
                                 "--target_path", seer_tfrecords
                             ],
                             inputs=[data_path_pipeline_param],
                             outputs=[seer_tfrecords],
                             compute_target=compute)

    return seer_tfrecords, prepStep
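
The tuple returned by process_step can then be assembled into a pipeline and submitted. A minimal sketch, assuming an existing Workspace `ws`, a Datastore `datastore` and a ComputeTarget `compute`; the experiment name and the paths are illustrative, not taken from the original project:

from azureml.core import Experiment
from azureml.data.datapath import DataPath
from azureml.pipeline.core import Pipeline

# Assemble the prep step into a pipeline (ws, datastore and compute are assumed to exist).
tfrecords, prep_step = process_step(datastore, compute, "raw-images")
pipeline = Pipeline(workspace=ws, steps=[prep_step])

# The "data" PipelineParameter defined above can be overridden at submission time
# with a different DataPath (the path shown here is illustrative).
run = Experiment(ws, "seer-prep").submit(
    pipeline,
    pipeline_parameters={"data": DataPath(datastore=datastore,
                                          path_on_datastore="raw-images-v2")})
run.wait_for_completion(show_output=True)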
Example #2
# 2. Data Process Step
# 3. Training Step
# 4. Model Registration Step
# 5. Pipeline registration
# 6. Submit the pipeline for execution


## Pipeline Parameters ##
# We need to tell the Pipeline what it needs to learn to see!

source_dataset = DataPath(
    datastore=ds, 
    path_on_datastore="simpsonslego-v3")

source_dataset_param = (PipelineParameter(name="source_dataset",default_value=source_dataset),
                          DataPathComputeBinding())

# Configuration for data prep and training steps #

## Data Process Step ##
# prep.py file versions our data in our data source #

# Output location for the pre-processed training images
training_data_location = PipelineData(name="simpsons_training_data", datastore=ds)

# Create the pre-process step
# The original listing is truncated here; the outputs and arguments below are an
# assumption, mirroring the --source_path/--target_path pattern from Example #1.
preProcessDataStep = PythonScriptStep(
    name="Pre-process data",
    script_name="steps/prep.py",
    compute_target=cluster,
    inputs=[source_dataset_param],
    outputs=[training_data_location],
    arguments=["--source_path", source_dataset_param,
               "--target_path", training_data_location])
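
On the compute target, the (PipelineParameter, DataPathComputeBinding) input is materialized as a mounted path and handed to the script as an ordinary command-line argument. A minimal sketch of what steps/prep.py might look like; the argument names mirror the ones used above and are assumptions, not code from the original project:

# steps/prep.py -- hypothetical sketch of the script side.
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("--source_path", type=str)   # mounted DataPath input
parser.add_argument("--target_path", type=str)   # PipelineData output folder
args = parser.parse_args()

os.makedirs(args.target_path, exist_ok=True)
print("Reading raw images from", args.source_path)
print("Writing processed data to", args.target_path)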
Example #3
# The following will be created and then run:
# 1. Pipeline Parameters
# 2. Data Process Step
# 3. Training Step
# 4. Model Registration Step
# 5. Pipeline registration
# 6. Submit the pipeline for execution

## Pipeline Parameters ##
# We need to tell the Pipeline what it needs to learn to see!

datapath = DataPath(datastore=datastore, path_on_datastore=datastorepath)
data_path_pipeline_param = (PipelineParameter(name="data",
                                              default_value=datapath),
                            DataPathComputeBinding(mode='mount'))

# Configuration for data prep and training steps #

dataprepEnvironment = Environment.from_pip_requirements(
    'dataprepenv', 'requirements-dataprepandtraining.txt')
dataprepRunConfig = RunConfiguration()
dataprepRunConfig.environment = dataprepEnvironment

## Data Process Step ##
# parse.py file parses the images in our data source #

seer_tfrecords = PipelineData("tfrecords_set",
                              datastore=datastore,
                              is_directory=True)
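
This example also stops before the step itself is created. A minimal sketch of a PythonScriptStep that would tie the run configuration, the data parameter and the output together, assuming an existing ComputeTarget `compute`; the step and script names are illustrative:

from azureml.pipeline.steps import PythonScriptStep

# Hypothetical continuation; `compute` is assumed to be an existing ComputeTarget.
prep_step = PythonScriptStep(
    name="Data Preparation",
    script_name="parse.py",
    source_directory=".",
    compute_target=compute,
    runconfig=dataprepRunConfig,
    inputs=[data_path_pipeline_param],
    outputs=[seer_tfrecords],
    arguments=["--source_path", data_path_pipeline_param,
               "--target_path", seer_tfrecords])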
Example #4
def create_pipeline(workspace):
    # Retrieve compute cluster
    compute_target = workspace.compute_targets[args.compute_target]

    # Setup batch scoring environment from conda dependencies
    environment = Environment.from_conda_specification(
        name=args.environment_name, file_path=args.environment_specification
    )

    # Add environment variables
    environment.environment_variables = {
        "APPLICATIONINSIGHTS_CONNECTION_STRING": args.ai_connection_string
    }

    # Enable docker run
    environment.docker.enabled = True

    # Create run config
    run_config = RunConfiguration()
    run_config.environment = environment

    # Retrieve input and output datastores
    input_datastore = Datastore(workspace, args.input_datastore_name)
    output_datastore = Datastore(workspace, args.output_datastore_name)

    # Define build id parameter
    build_id_param = PipelineParameter("build_id", default_value=args.build_id)

    # Define input datapath parameter
    input_datapath = DataPath(datastore=input_datastore, path_on_datastore="")
    input_datapath_param = (
        PipelineParameter(name="input_datapath", default_value=input_datapath),
        DataPathComputeBinding(mode="mount"),
    )

    # Define output datapath parameter
    output_datapath = DataPath(datastore=output_datastore, path_on_datastore="")
    output_datapath_param = (
        PipelineParameter(name="output_datapath", default_value=output_datapath),
        DataPathComputeBinding(mode="mount"),
    )

    # Define score step for pipeline
    score_step = PythonScriptStep(
        name="score_data",
        compute_target=compute_target,
        source_directory="src/score",
        script_name="score.py",
        inputs=[input_datapath_param, output_datapath_param],
        runconfig=run_config,
        allow_reuse=False,
        arguments=[
            "--build_id",
            build_id_param,
            "--input_datapath",
            input_datapath_param,
            "--output_datapath",
            output_datapath_param,
        ],
    )

    # Define pipeline for batch scoring
    pipeline = Pipeline(workspace=workspace, steps=[score_step])

    return pipeline
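
A sketch of how create_pipeline might be driven; the experiment and pipeline names are assumptions:

from azureml.core import Experiment, Workspace

# Hypothetical driver code.
ws = Workspace.from_config()
pipeline = create_pipeline(ws)
pipeline.validate()

# Submit directly for a one-off run ...
run = Experiment(ws, "batch-scoring").submit(pipeline)

# ... or publish it so it can be triggered via REST or a schedule.
published_pipeline = pipeline.publish(name="batch-scoring-pipeline",
                                      description="Batch scoring pipeline")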
Example #5
environment = Environment.get(workspace, name="AzureML-Scikit-learn-0.20.3")
environment.docker.enabled = True

run_config = RunConfiguration()
run_config.environment = environment

compute_target = workspace.compute_targets["cpu"]
run_config.target = compute_target

train_features_datapath = DataPath(
    datastore=blobstore, path_on_datastore="training_set_features.csv")
train_features_path_parameter = PipelineParameter(
    name="train_features", default_value=train_features_datapath)
train_features_path = (train_features_path_parameter,
                       DataPathComputeBinding(mode="mount"))

train_labels_datapath = DataPath(datastore=blobstore,
                                 path_on_datastore="training_set_labels.csv")
train_labels_path_parameter = PipelineParameter(
    name="train_labels", default_value=train_labels_datapath)
train_labels_path = (train_labels_path_parameter,
                     DataPathComputeBinding(mode="mount"))

test_features_datapath = DataPath(datastore=blobstore,
                                  path_on_datastore="test_set_features.csv")
test_features_path_parameter = PipelineParameter(
    name="test_features", default_value=test_features_datapath)
test_features_path = (test_features_path_parameter,
                      DataPathComputeBinding(mode="mount"))
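
The listing ends before any step is defined. A minimal sketch of a training step that could consume the three DataPath bindings above; the script name and argument names are assumptions:

from azureml.pipeline.steps import PythonScriptStep

# Hypothetical training step built on the run_config and compute_target above.
train_step = PythonScriptStep(
    name="train",
    script_name="train.py",
    source_directory=".",
    compute_target=compute_target,
    runconfig=run_config,
    inputs=[train_features_path, train_labels_path, test_features_path],
    arguments=["--train-features", train_features_path,
               "--train-labels", train_labels_path,
               "--test-features", test_features_path])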
Example #6
    def __init__(self,
                 source_directory,
                 script=None,
                 arguments=None,
                 run_config=None,
                 _telemetry_values=None,
                 compute_target=None,
                 environment=None,
                 distributed_job_config=None,
                 resume_from=None,
                 max_run_duration_seconds=MAX_DURATION_SECONDS_DEFAULT,
                 command=None,
                 docker_runtime_config=None):
        """Class ScriptRunConfig constructor.

        :param source_directory: A local directory containing code files needed for a run.
        :type source_directory: str
        :param script: The file path relative to the source_directory of the script to be run.
        :type script: str
        :param arguments: Optional command line arguments to pass to the training script.
            Arguments are passed in pairs, for example, ['--arg1', arg1_val, '--arg2', arg2_val].
        :type arguments: builtin.list[str]
        :param run_config: Optional run configuration to use.
        :type run_config: azureml.core.runconfig.RunConfiguration
        :param _telemetry_values: Internal use only.
        :type _telemetry_values: dict
        :param compute_target: The compute target where training will happen. This can either be a ComputeTarget
            object, the name of an existing ComputeTarget, or the string "local". If no compute target is
            specified, your local machine will be used.
        :type compute_target: azureml.core.compute_target.AbstractComputeTarget or str
        :param environment: The environment to use for the run. If no environment is specified,
            azureml.core.runconfig.DEFAULT_CPU_IMAGE will be used as the Docker image for the run.
        :type environment: azureml.core.environment.Environment
        :param distributed_job_config: For jobs that require additional distributed job-specific configurations.
        :type distributed_job_config: azureml.core.runconfig.TensorflowConfiguration,
            azureml.core.runconfig.MpiConfiguration, or azureml.core.runconfig.PyTorchConfiguration
        :param resume_from: The DataPath containing the checkpoint or model files from which to resume the
            experiment.
        :type resume_from: azureml.data.datapath.DataPath
        :param max_run_duration_seconds: The maximum time allowed for the run. The system will attempt to
            automatically cancel the run if it runs longer than this value.
        :type max_run_duration_seconds: int
        :param command: The command to be submitted for the run. The command property can also be used instead of
            script/arguments. The command and script/arguments properties cannot be used together to submit a run.
            To submit a script file using the command property - ['python', 'train.py', '--arg1', arg1_val]
            To run an actual command - ['ls']
        :type command: builtin.list[str] or str
        :param docker_runtime_config: For jobs that require Docker runtime-specific configurations.
        :type docker_runtime_config: azureml.core.runconfig.DockerConfiguration
        """
        self.source_directory = source_directory
        self.script = script
        self.command = command
        self.arguments = arguments

        if run_config:
            if (max_run_duration_seconds != ScriptRunConfig.MAX_DURATION_SECONDS_DEFAULT) or \
                    any([compute_target, environment, distributed_job_config, docker_runtime_config]):
                logging.warning(
                    "If 'run_config' is specified, the following parameters will be "
                    "ignored: 'compute_target', 'environment', 'distributed_job_config', "
                    "'max_run_duration_seconds', and 'docker_runtime_config'.")

            if run_config.script and self.script:
                logging.warning(
                    "If 'script' has been provided here and a script file name has been specified in "
                    "'run_config', 'script' provided in ScriptRunConfig initialization will take "
                    "precedence.")
            elif run_config.script and not self.script:
                self.script = run_config.script

            if run_config.arguments and self.arguments:
                logging.warning(
                    "If 'arguments' has been provided here and arguments have been specified in "
                    "'run_config', 'arguments' provided in ScriptRunConfig initialization will "
                    "take precedence.")
            elif run_config.arguments and not self.arguments:
                self.arguments = run_config.arguments

            if run_config.command and self.command:
                logging.warning(
                    "If 'command' has been provided here and command has been specified in "
                    "'run_config', 'command' provided in ScriptRunConfig initialization will "
                    "take precedence.")
            elif run_config.command and not self.command:
                self.command = run_config.command

            self.run_config = run_config
        else:
            self.run_config = RunConfiguration()
            self.run_config.target = compute_target if compute_target else "local"
            self.run_config.environment = environment if environment else EnvironmentDefinition(
            )
            self.run_config.max_run_duration_seconds = max_run_duration_seconds

            if distributed_job_config:
                if not isinstance(distributed_job_config,
                                  (TensorflowConfiguration, MpiConfiguration,
                                   PyTorchConfiguration)):
                    raise RunConfigurationException(
                        "'distributed_job_config' must be an "
                        "'azureml.core.runconfig.TensorflowConfiguration', "
                        "'azureml.core.runconfig.MpiConfiguration', or "
                        "'azureml.core.runconfig.PyTorchConfiguration' object."
                    )

                self.run_config.node_count = (
                    distributed_job_config.worker_count if isinstance(
                        distributed_job_config, TensorflowConfiguration) else
                    distributed_job_config.node_count)

                if isinstance(distributed_job_config, TensorflowConfiguration):
                    self.run_config.tensorflow = distributed_job_config
                    self.run_config.framework = "TensorFlow"
                    self.run_config.communicator = "ParameterServer"
                elif isinstance(distributed_job_config, MpiConfiguration):
                    self.run_config.mpi = distributed_job_config
                    self.run_config.framework = "Python"
                    self.run_config.communicator = "IntelMpi"
                elif isinstance(distributed_job_config, PyTorchConfiguration):
                    self.run_config.pytorch = distributed_job_config
                    self.run_config.framework = "PyTorch"
                    self.run_config.communicator = distributed_job_config.communication_backend

            if docker_runtime_config:
                if not isinstance(docker_runtime_config, DockerConfiguration):
                    raise RunConfigurationException(
                        "'docker_runtime_config' must be a 'DockerConfiguration' "
                        "object.")
                self.run_config.docker = docker_runtime_config
            elif environment and environment.docker:
                # Docker configuration in run config is higher priority than docker settings
                # in environment docker section, explicitly assign the setting values here
                # to keep backward compatibility.
                self.run_config.docker = DockerConfiguration(
                    use_docker=environment.docker._enabled,
                    shm_size=environment.docker._shm_size,
                    shared_volumes=environment.docker._shared_volumes,
                    arguments=environment.docker._arguments)

        if resume_from:
            if not isinstance(resume_from, DataPath):
                raise UserErrorException(
                    "resume_from parameter should be of type DataPath. "
                    "Found {}.".format(type(resume_from)))
            outputs_data_reference = resume_from. \
                create_data_reference(data_reference_name="MODEL_LOCATION",
                                      datapath_compute_binding=DataPathComputeBinding(mode="mount"))
            if self.arguments is None:
                self.arguments = []

            self.arguments.extend(
                ["--resume-from", str(outputs_data_reference)])
            self.run_config.data_references[outputs_data_reference.data_reference_name] = \
                outputs_data_reference.to_config()

        self._telemetry_values = _telemetry_values
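
A minimal usage sketch for the constructor described above, assuming an existing Workspace `ws` and Environment `env`; the script, arguments and experiment name are illustrative:

from azureml.core import Experiment, ScriptRunConfig

# Script/arguments form (the command form below is the mutually exclusive alternative).
src = ScriptRunConfig(source_directory=".",
                      script="train.py",
                      arguments=["--epochs", "10"],
                      compute_target="local",
                      environment=env)

# Equivalent command form -- cannot be combined with script/arguments:
# src = ScriptRunConfig(source_directory=".",
#                       command=["python", "train.py", "--epochs", "10"],
#                       environment=env)

run = Experiment(ws, "train-experiment").submit(src)
run.wait_for_completion(show_output=True)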
Example #7
def create_experiment_config(workspace):
    ########################################
    ### Creating data prep Pipeline Step ###
    ########################################

    # Load settings
    print("Loading settings")
    data_prep_step_path = os.path.join("steps", "data_prep")
    with open(os.path.join(data_prep_step_path, "step.json")) as f:
        data_prep_settings = json.load(f)

    # Setup datasets - Create PipelineParameter for dynamic pipeline input
    print("Setting up datasets with dynamic input")
    data_prep_input_path = DataPath(
        datastore=Datastore(workspace=workspace,
                            name=data_prep_settings.get(
                                "datastore_input_name", "workspaceblobstore")),
        path_on_datastore=
        "golden/Atlantis/PAX1/15-Mar-2020-23-37-50-279971/PAX1.parquet/")
    data_prep_input_path_pipeline_parameter = PipelineParameter(
        name="input_path", default_value=data_prep_input_path)
    data_prep_input = (data_prep_input_path_pipeline_parameter,
                       DataPathComputeBinding(mode="mount"))
    data_prep_output = PipelineData(
        name=data_prep_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=data_prep_settings.get(
                                "datastore_output_name",
                                "workspaceblobstore")),
        output_mode="mount").as_dataset()
    # Uncomment the next lines if you want to register the intermediate dataset
    #data_prep_output.register(
    #    name=data_prep_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    data_prep_dependencies = CondaDependencies.create(
        pip_packages=data_prep_settings.get("pip_packages", []),
        conda_packages=data_prep_settings.get("conda_packages", []),
        python_version=data_prep_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    data_prep_run_config = RunConfiguration(
        conda_dependencies=data_prep_dependencies,
        framework=data_prep_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    data_prep_compute_target = ComputeTarget(workspace=workspace,
                                             name=data_prep_settings.get(
                                                 "compute_target_name", None))

    # Create python step
    print("Creating Step")
    data_prep = PythonScriptStep(
        name=data_prep_settings.get("step_name", None),
        script_name=data_prep_settings.get("script_name", None),
        arguments=data_prep_settings.get("arguments", []) +
        ["--input-datapath", data_prep_input],
        compute_target=data_prep_compute_target,
        runconfig=data_prep_run_config,
        inputs=[data_prep_input],
        outputs=[data_prep_output],
        params=data_prep_settings.get("parameters", []),
        source_directory=data_prep_step_path,
        allow_reuse=data_prep_settings.get("allow_reuse", True),
        version=data_prep_settings.get("version", None),
    )

    ############################################
    ### Creating inference Parallel Run Step ###
    ############################################

    # Load settings
    print("Loading settings")
    batch_inference_step_path = os.path.join("steps", "batch_inference")
    with open(os.path.join(batch_inference_step_path, "step.json")) as f:
        batch_inference_settings = json.load(f)

    # Setup datasets of first step
    print("Setting up datasets")
    batch_inference_input = data_prep_output.as_named_input(
        name=batch_inference_settings.get("dataset_input_name", None))
    batch_inference_output = PipelineData(
        name=batch_inference_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=batch_inference_settings.get(
                                "datastore_output_name", None)),
        output_mode="mount",
    ).as_dataset()
    # Uncomment the next lines if you want to register the intermediate dataset
    #batch_inference_output.register(
    #    name=batch_inference_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    batch_inference_dependencies = CondaDependencies.create(
        pip_packages=batch_inference_settings.get("pip_packages", []),
        conda_packages=batch_inference_settings.get("conda_packages", []),
        python_version=batch_inference_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    batch_inference_run_config = RunConfiguration(
        conda_dependencies=batch_inference_dependencies,
        framework=batch_inference_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    batch_inference_compute_target = ComputeTarget(
        workspace=workspace,
        name=batch_inference_settings.get("compute_target_name", None))

    # Create python step
    print("Creating Step")
    batch_inference = PythonScriptStep(
        name=batch_inference_settings.get("step_name", None),
        script_name=batch_inference_settings.get("script_name", None),
        arguments=batch_inference_settings.get("arguments", []),
        compute_target=batch_inference_compute_target,
        runconfig=batch_inference_run_config,
        inputs=[batch_inference_input],
        outputs=[batch_inference_output],
        params=batch_inference_settings.get("parameters", []),
        source_directory=batch_inference_step_path,
        allow_reuse=batch_inference_settings.get("allow_reuse", True),
        version=batch_inference_settings.get("version", None),
    )

    #########################
    ### Creating Pipeline ###
    #########################

    # Create Pipeline
    print("Creating Pipeline")
    pipeline = Pipeline(
        workspace=workspace,
        steps=[batch_inference],
        description="Batch Inference Pipeline",
    )

    return pipeline
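
A sketch of how the returned pipeline might be run; note that only batch_inference is listed in steps, and the data_prep step is expected to be pulled in through the dataset dependency when the graph is resolved. The experiment name is an assumption:

from azureml.core import Experiment, Workspace

# Hypothetical driver code.
ws = Workspace.from_config()
pipeline = create_experiment_config(ws)
pipeline.validate()
run = Experiment(ws, "batch-inference").submit(pipeline)
run.wait_for_completion(show_output=True)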