Example 1
    def create_pipeline(self, steps):
        step_sequence = StepSequence(steps=steps)

        collection_preprocessing_pipeline = Pipeline(workspace=self.ws,
                                                     steps=[step_sequence])
        collection_preprocessing_pipeline.validate()
        print("Collection and preprocessing pipeline built and validated")

        return Experiment(self.ws, 'Collection-preprocessing').submit(
            collection_preprocessing_pipeline, regenerate_outputs=False)
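All of the examples on this page share the same azureml pattern: wrap the steps in a StepSequence (which runs them in the given order), build a Pipeline from it, validate, and submit the pipeline as an Experiment run. The snippets omit their imports, so here is a minimal, self-contained sketch of that pattern; the config.json workspace file, the prepare.py script, and the "cpu-cluster" compute name are assumptions for illustration only.

from azureml.core import Experiment, RunConfiguration, Workspace
from azureml.pipeline.core import Pipeline, StepSequence
from azureml.pipeline.steps import PythonScriptStep

# Assumed: a local config.json for the workspace and a prepare.py script.
ws = Workspace.from_config()
prepare_step = PythonScriptStep(script_name="prepare.py",
                                source_directory=".",
                                compute_target="cpu-cluster",  # assumed existing compute
                                runconfig=RunConfiguration())

# Steps in a StepSequence execute in the order they are listed.
step_sequence = StepSequence(steps=[prepare_step])
pipeline = Pipeline(workspace=ws, steps=step_sequence)
pipeline.validate()
run = Experiment(ws, "stepsequence-demo").submit(pipeline)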
Example 2
    def create_pipeline(self, steps):
        step_sequence = StepSequence(steps=steps)

        train_pipeline = Pipeline(workspace=self.ws, steps=step_sequence)
        train_pipeline.validate()
        print("Pipeline register built and validated")

        Experiment(self.ws,
                   'Register-train-deploy').submit(train_pipeline,
                                                   regenerate_outputs=False)
        print("Register, train and deploy experiment created and submitted")
Example 3
    def run_pipeline(self, experiment_name, tags=None):
        """
        submits batch inference pipeline as an experiment run

        :param str experiment_name: [required] name of the experiment in azureml
        :param dict tags: [optional] dictionary of tags
        :returns: run
        :rtype: Run
        """
        if tags is None:
            tags = self.pipeline_tags
        step_sequence = StepSequence(steps=self.steps)
        pipeline = Pipeline(workspace=self.ws, steps=step_sequence)
        run = Experiment(self.ws, experiment_name).submit(
            pipeline, tags=tags, continue_on_step_failure=False)
        return run
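The method above returns the submitted Run, which can be monitored like any other run. A short usage sketch, assuming the surrounding class is instantiated as batch_pipeline (a hypothetical name):

# 'batch_pipeline' is an assumed instance of the class shown above.
run = batch_pipeline.run_pipeline("batch-inference", tags={"trigger": "manual"})
run.wait_for_completion(show_output=True)  # block and stream step logs
print(run.get_status())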
Example 4
        "--num_workers",
        num_workers_param,
        "--train_batch_size",
        train_batch_size_param,
        "--eval_batch_size",
        eval_batch_size_param,
        "--output_dir",
        "./outputs",
        "--do_train",
        "--do_predict",
    ],
    inputs=[prepared_dataset.as_mount()],
    compute_target=compute_target,
)

step_sequence = StepSequence(steps=[prepare_step, train_step])
pipeline = Pipeline(workspace, steps=step_sequence)

# Submit single experiment run
run = experiment.submit(pipeline)

# Run the three listed models over 5 random seeds. (15 experiment runs total)
# for seed in range(5):
#     for model in ["distilbert-base-cased", "bert-base-cased", "albert-base-v2"]:
#         run = experiment.submit(
#             pipeline,
#             pipeline_parameters={
#                 "model_name_or_path": model,
#                 "task": "cola",
#                 "train_batch_size": 32,
#                 "eval_batch_size": 32,
Example 5
def build_pipeline_steps(automlconfig: AutoMLConfig,
                         data: Dataset,
                         target_column: str,
                         compute_target: ComputeTarget,
                         group_column_names: list,
                         time_column_name: str,
                         deploy: bool,
                         service_name: str = 'grouping-demo') -> StepSequence:
    steps = []

    metrics_output_name = 'metrics_{}'
    best_model_output_name = 'best_model_{}'
    count = 0
    model_names = []

    # get all automl configs by group
    configs = _get_configs(automlconfig, data, target_column, compute_target, group_column_names)

    # build a runconfig for register model
    register_config = RunConfiguration()
    cd = CondaDependencies()
    cd.add_pip_package('azureml-pipeline')
    register_config.environment.python.conda_dependencies = cd

    # create each automl step end-to-end (train, register)
    for group_name, conf in configs.items():
        # create automl metrics output
        metrics_data = PipelineData(
            name='metrics_data_{}'.format(group_name),
            pipeline_output_name=metrics_output_name.format(group_name),
            training_output=TrainingOutput(type='Metrics'))
        # create automl model output
        model_data = PipelineData(
            name='model_data_{}'.format(group_name),
            pipeline_output_name=best_model_output_name.format(group_name),
            training_output=TrainingOutput(type='Model', metric=conf.user_settings['primary_metric']))

        automl_step = AutoMLStep(
            name='automl_{}'.format(group_name),
            automl_config=conf,
            outputs=[metrics_data, model_data],
            allow_reuse=True)
        steps.append(automl_step)

        # pass the group name as a parameter to the register step ->
        # this will become the name of the model for this group.
        group_name_param = PipelineParameter("group_name_{}".format(count), default_value=group_name)
        count += 1

        reg_model_step = PythonScriptStep(
            'register.py',
            name='register_{}'.format(group_name),
            arguments=["--model_name", group_name_param, "--model_path", model_data],
            inputs=[model_data],
            compute_target=compute_target,
            runconfig=register_config,
            source_directory="register",
            allow_reuse=True
        )
        steps.append(reg_model_step)
        model_names.append(group_name)

    final_steps = steps
    if deploy:
        # modify the conda dependencies to ensure we pick up correct
        # versions of azureml-defaults and azureml-train-automl
        cd = CondaDependencies.create(pip_packages=['azureml-defaults', 'azureml-train-automl'])
        automl_deps = CondaDependencies(conda_dependencies_file_path='deploy/myenv.yml')
        cd._merge_dependencies(automl_deps)
        cd.save('deploy/myenv.yml')

        # add deployment step
        pp_group_column_names = PipelineParameter(
            "group_column_names",
            default_value="#####".join(list(reversed(group_column_names))))

        pp_model_names = PipelineParameter(
            "model_names",
            default_value=json.dumps(model_names))

        pp_service_name = PipelineParameter(
            "service_name",
            default_value=service_name)

        deployment_step = PythonScriptStep(
            'deploy.py',
            name='service_deploy',
            arguments=["--group_column_names", pp_group_column_names,
                       "--model_names", pp_model_names,
                       "--service_name", pp_service_name,
                       "--time_column_name", time_column_name],
            compute_target=compute_target,
            runconfig=RunConfiguration(),
            source_directory="deploy"
        )
        final_steps = StepSequence(steps=[steps, deployment_step])

    return final_steps
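build_pipeline_steps returns either a plain list of steps or a StepSequence; Pipeline accepts both. A minimal sketch of consuming the return value (all argument values here are placeholders, not taken from the original sample):

# Placeholder arguments; in practice these come from the surrounding script.
steps = build_pipeline_steps(automl_config, dataset, "Quantity",
                             compute_target, ["Store", "Brand"], "WeekStarting",
                             deploy=True)
pipeline = Pipeline(workspace=ws, steps=steps)
run = Experiment(ws, "grouping-demo").submit(pipeline)
run.wait_for_completion(show_output=True)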
Example 6
# Configuring a PythonScriptStep with a RunConfiguration
# that includes debugpy and azure-debug-relay

run_config = RunConfiguration()
conda_dependencies = run_config.environment.python.conda_dependencies
conda_dependencies.add_conda_package("pip")
conda_dependencies.add_conda_package("scikit-learn")
conda_dependencies.add_pip_package("azureml-sdk==" + amlcore.__version__)
conda_dependencies.add_pip_package("azureml-defaults")

train_step = PythonScriptStep(name='Train Step with Debugging',
                              script_name="diabetes_train_2.py",
                              source_directory="./scripts",
                              compute_target=compute_target,
                              runconfig=run_config,
                              allow_reuse=False)

print('About to submit')

# Submitting an Azure ML Pipeline Run
step_sequence = StepSequence(steps=[train_step])
pipeline = Pipeline(workspace, steps=step_sequence)
experiment = Experiment(workspace=workspace, name=experiment_name)
run = experiment.submit(pipeline)
print('submitted')
# Show the running experiment run in the notebook widget
#RunDetails(run).show()

# Block until the experiment run has completed
run.wait_for_completion()
Example 7
    mini_batch_size="1",
    error_threshold=30,
    output_action="append_row",
    environment=env,
    compute_target='cpu-cluster',
    append_row_file_name="my_outputs.txt",
    run_invocation_timeout=1200,
    node_count=1)

parallelrun_step = ParallelRunStep(
    name="parallelapicalls",
    parallel_run_config=parallel_run_config,
    arguments=["--arg1", string_pipeline_param],
    inputs=[dataset.as_named_input("inputds")],
    output=output_dir
    # models=[model]  # not needed, as it's only relevant to batch inferencing
    # arguments=[],
    # allow_reuse=True
)

print('parallelrun_step created')

step_sequence = StepSequence(steps=[StepToWriteDateFile, parallelrun_step])
pipeline = Pipeline(workspace=workspace, steps=step_sequence)
print("pipeline with the 2 steps created")

pipeline.publish(name='New pipeline to pull usage',
                 description='pull usage data in parallel',
                 version="1.0",
                 continue_on_step_failure=None)
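publish() returns a PublishedPipeline, which the example above discards. A short sketch of the same call with the return value captured and the published pipeline re-submitted (the experiment name 'usage-pull' is a placeholder):

published = pipeline.publish(name='New pipeline to pull usage',
                             description='pull usage data in parallel',
                             version="1.0")
print(published.id, published.endpoint)  # endpoint is the REST URL for external triggers

# A published pipeline can be re-submitted against the workspace at any time.
run = published.submit(workspace, experiment_name='usage-pull')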
Example 8
compute_target = ComputeTarget(ws, name='cpu-cluster')

aml_run_config = RunConfiguration()
aml_run_config.environment = Environment.get(workspace=ws, name="sklearn-env")
aml_run_config.target = compute_target

train_model = PythonScriptStep(script_name='src/execute_train.py',
                               compute_target=compute_target,
                               runconfig=aml_run_config)

deploy_model = PythonScriptStep(script_name="src/deploy_model.py",
                                compute_target=compute_target,
                                runconfig=aml_run_config)

steps = [train_model, deploy_model]
step_seq = StepSequence(steps=steps)
pipeline = Pipeline(workspace=ws, steps=step_seq)

pp = pipeline.publish(
    name="TitanicDeploymentPipeline",
    description="Training and deployment pipeline for our titanic API.",
    version="1.0")

# We can also set up a trigger based schedule using the Datastore class - https://docs.microsoft.com/en-us/azure/machine-learning/how-to-schedule-pipelines#create-a-time-based-schedule
recurrence = ScheduleRecurrence(frequency="Month",
                                interval=1,
                                start_time='2020-11-01T00:00:00')
recurring_schedule = Schedule.create(ws,
                                     name="TitanicRetrainingSchedule",
                                     description="Once a month training",
                                     pipeline_id=pp.id,
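The Schedule.create call above is cut off in the original snippet. A plausible completion, passing the recurrence defined just above it; the experiment name 'TitanicRetraining' is a placeholder, not from the source:

recurring_schedule = Schedule.create(ws,
                                     name="TitanicRetrainingSchedule",
                                     description="Once a month training",
                                     pipeline_id=pp.id,
                                     experiment_name="TitanicRetraining",  # placeholder name
                                     recurrence=recurrence)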