def train_step(train_dir, compute_target):

    max_depth = PipelineParameter(name='max_depth', default_value=5)
    n_estimators = PipelineParameter(name='n_estimators', default_value=500)

    model_dir = PipelineData(name='model_dir',
                             pipeline_output_name='model_dir',
                             datastore=train_dir.datastore,
                             output_mode='mount',
                             is_directory=True)

    outputs = [model_dir]
    outputs_map = {'model_dir': model_dir}

    estimator = SKLearn(source_directory=os.path.dirname(
        os.path.abspath(__file__)),
                        entry_script='train.py',
                        compute_target=compute_target)

    step = EstimatorStep(estimator=estimator,
                         estimator_entry_script_arguments=[
                             '--train_dir', train_dir, '--output_dir',
                             model_dir, '--max_depth', max_depth,
                             '--n_estimators', n_estimators
                         ],
                         inputs=[train_dir],
                         compute_target=compute_target,
                         outputs=outputs,
                         allow_reuse=False)

    return step, outputs_map
def evaluate_step(model_dir, test_dir, compute_target):

    accuracy_file = PipelineData(name='accuracy_file',
                                 pipeline_output_name='accuracy_file',
                                 datastore=test_dir.datastore,
                                 output_mode='mount',
                                 is_directory=False)

    outputs = [accuracy_file]
    outputs_map = {'accuracy_file': accuracy_file}

    estimator = SKLearn(source_directory=os.path.dirname(
        os.path.abspath(__file__)),
                        entry_script='evaluate.py',
                        compute_target=compute_target)

    step = EstimatorStep(estimator=estimator,
                         estimator_entry_script_arguments=[
                             '--test_dir', test_dir, '--model_dir', model_dir,
                             '--accuracy_file', accuracy_file
                         ],
                         inputs=[model_dir, test_dir],
                         outputs=outputs,
                         compute_target=compute_target,
                         allow_reuse=True)

    return step, outputs_map
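
These two helpers are meant to be composed by a caller that owns the workspace, data references and compute. A minimal assembly sketch, assuming `workspace`, `train_dir`, `test_dir` and `compute_target` are already defined (names and experiment name are illustrative, not part of the snippet above):

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Wire the model_dir output of the training step into the evaluation step.
train, train_outputs = train_step(train_dir, compute_target)
evaluate, evaluate_outputs = evaluate_step(train_outputs['model_dir'], test_dir, compute_target)

pipeline = Pipeline(workspace=workspace, steps=[train, evaluate])
pipeline.validate()

# PipelineParameters declared in train_step can be overridden per run.
run = Experiment(workspace, 'sklearn-train-evaluate').submit(
    pipeline, pipeline_parameters={'max_depth': 8, 'n_estimators': 300})
run.wait_for_completion(show_output=True)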
Example #3
def process_step(datastore: Datastore, compute: ComputeTarget,
                 path_on_datastore: str) -> (PipelineData, EstimatorStep):
    datapath = DataPath(datastore=datastore,
                        path_on_datastore=path_on_datastore)
    data_path_pipeline_param = (PipelineParameter(name="data",
                                                  default_value=datapath),
                                DataPathComputeBinding(mode='mount'))

    seer_tfrecords = PipelineData("tfrecords_set",
                                  datastore=datastore,
                                  is_directory=True)

    prep = Estimator(source_directory='.',
                     compute_target=compute,
                     entry_script='prep.py',
                     pip_requirements_file='requirements.txt')

    prepStep = EstimatorStep(name='Data Preparation',
                             estimator=prep,
                             estimator_entry_script_arguments=[
                                 "--source_path", data_path_pipeline_param,
                                 "--target_path", seer_tfrecords
                             ],
                             inputs=[data_path_pipeline_param],
                             outputs=[seer_tfrecords],
                             compute_target=compute)

    return seer_tfrecords, prepStep
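
Because the source path is wrapped in a PipelineParameter with a DataPath default (bound via DataPathComputeBinding), the same pipeline can be re-pointed at a different folder at submit time. A minimal sketch, assuming `ws`, `datastore` and `compute` already exist; the paths and experiment name here are illustrative:

from azureml.core import Experiment
from azureml.data.datapath import DataPath
from azureml.pipeline.core import Pipeline

tfrecords, prep = process_step(datastore, compute, 'seer/baseline')
pipeline = Pipeline(workspace=ws, steps=[prep])

# Override the "data" parameter for this particular run.
new_path = DataPath(datastore=datastore, path_on_datastore='seer/new-images')
run = Experiment(ws, 'seer-prep').submit(pipeline, pipeline_parameters={'data': new_path})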
Example #4
def train_step(train_dir, valid_dir, compute_target):
    '''
    This step fine-tunes a ResNet-18 model on our dataset using PyTorch.
    It will use the corresponding input image directories as training and validation data.

    :param train_dir: The reference to the directory containing the training data
    :type train_dir: DataReference
    :param valid_dir: The reference to the directory containing the validation data
    :type valid_dir: DataReference
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget
    
    :return: The training step, step outputs dictionary (keys: model_dir)
    :rtype: EstimatorStep, dict
    '''

    num_epochs = PipelineParameter(name='num_epochs', default_value=25)
    batch_size = PipelineParameter(name='batch_size', default_value=16)
    learning_rate = PipelineParameter(name='learning_rate', default_value=0.001)
    momentum = PipelineParameter(name='momentum', default_value=0.9)

    model_dir = PipelineData(
        name='model_dir', 
        pipeline_output_name='model_dir',
        datastore=train_dir.datastore,
        output_mode='mount',
        is_directory=True)

    outputs = [model_dir]
    outputs_map = { 'model_dir': model_dir }

    estimator = PyTorch(
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        entry_script='train.py',
        framework_version='1.3',
        compute_target=compute_target,
        use_gpu=True)

    step = EstimatorStep(
        name="Train Model",
        estimator=estimator,
        estimator_entry_script_arguments=[
            '--train_dir', train_dir, 
            '--valid_dir', valid_dir, 
            '--output_dir', model_dir, 
            '--num_epochs', num_epochs, 
            '--batch_size', batch_size,
            '--learning_rate', learning_rate, 
            '--momentum', momentum
        ],
        inputs=[train_dir, valid_dir],
        compute_target=compute_target,
        outputs=outputs,
        allow_reuse=False)

    return step, outputs_map
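
On the compute target, every PipelineParameter and PipelineData listed in estimator_entry_script_arguments arrives as a plain command-line string (model_dir resolves to a mounted path). A hypothetical excerpt of the matching train.py argument parsing, shown only for illustration (the real script is not part of the snippet above):

import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument('--train_dir', type=str)
parser.add_argument('--valid_dir', type=str)
parser.add_argument('--output_dir', type=str)
parser.add_argument('--num_epochs', type=int, default=25)
parser.add_argument('--batch_size', type=int, default=16)
parser.add_argument('--learning_rate', type=float, default=0.001)
parser.add_argument('--momentum', type=float, default=0.9)
args = parser.parse_args()

# The output directory is mounted by the pipeline; create it before saving the model.
os.makedirs(args.output_dir, exist_ok=True)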
Example #5
def evaluate_step(model_dir, test_dir, compute_target):
    '''
    This step evaluates the trained model on the testing data and outputs the accuracy.

    :param model_dir: The reference to the directory containing the trained model
    :type model_dir: DataReference
    :param test_dir: The reference to the directory containing the testing data
    :type test_dir: DataReference
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget
    
    :return: The evaluate step, step outputs dictionary (keys: accuracy_file)
    :rtype: EstimatorStep, dict
    '''

    accuracy_file = PipelineData(
        name='accuracy_file', 
        pipeline_output_name='accuracy_file',
        datastore=test_dir.datastore,
        output_mode='mount',
        is_directory=False)

    outputs = [accuracy_file]
    outputs_map = { 'accuracy_file': accuracy_file }
    
    estimator = PyTorch(
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        entry_script='evaluate.py',
        framework_version='1.3',
        compute_target=compute_target,
        use_gpu=True)

    step = EstimatorStep(
        name="Evaluate Model",
        estimator=estimator,
        estimator_entry_script_arguments=[
            '--test_dir', test_dir, 
            '--model_dir', model_dir, 
            '--accuracy_file', accuracy_file
        ],
        inputs=[model_dir, test_dir],
        outputs=outputs,
        compute_target=compute_target,
        allow_reuse=True)

    return step, outputs_map
Example #6
def estimator(data, store, compute):
    estimator = Estimator(source_directory=os.path.dirname(
        os.path.abspath(__file__)),
                          compute_target=compute,
                          entry_script='train.py',
                          pip_packages=['azureml-dataprep', 'lightgbm'])

    output = PipelineData("output", datastore=store)

    step = EstimatorStep(name=os.path.basename(__file__),
                         estimator=estimator,
                         estimator_entry_script_arguments=[
                             '--input_dir', data, '--output_dir', output
                         ],
                         inputs=[data],
                         outputs=[output],
                         compute_target=estimator._compute_target,
                         allow_reuse=True)

    return step, output
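
A hypothetical call site for this helper; `input_data`, `def_blob_store` and `cpu_cluster` stand in for objects defined elsewhere in the surrounding pipeline code:

lightgbm_step, lightgbm_output = estimator(data=input_data, store=def_blob_store, compute=cpu_cluster)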
Example #7
def register_step(datastore: Datastore, input_data: PipelineData,
                  compute: ComputeTarget,
                  build: str) -> (PipelineData, EstimatorStep):
    seer_model = PipelineData("model", datastore=datastore, is_directory=True)

    register = Estimator(source_directory='.',
                         compute_target=compute,
                         entry_script='register.py')

    registerStep = EstimatorStep(name='Model Registration',
                                 estimator=register,
                                 estimator_entry_script_arguments=[
                                     "--source_path", input_data,
                                     "--target_path", seer_model, "--build",
                                     build
                                 ],
                                 inputs=[input_data],
                                 outputs=[seer_model],
                                 compute_target=compute)

    return seer_model, registerStep
Example #8
def train_step(datastore: Datastore, input_data: PipelineData,
               compute: ComputeTarget) -> (PipelineData, EstimatorStep):
    seer_training = PipelineData("train",
                                 datastore=datastore,
                                 is_directory=True)

    train = Estimator(source_directory='.',
                      compute_target=compute,
                      entry_script='train.py',
                      use_gpu=True,
                      pip_requirements_file='requirements.txt')

    trainStep = EstimatorStep(name='Model Training',
                              estimator=train,
                              estimator_entry_script_arguments=[
                                  "--source_path", input_data, "--target_path",
                                  seer_training, "--epochs", 15, "--batch", 10,
                                  "--lr", 0.001
                              ],
                              inputs=[input_data],
                              outputs=[seer_training],
                              compute_target=compute)

    return seer_training, trainStep
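
The seer helpers (process_step, train_step and register_step from the earlier examples) chain through their PipelineData outputs. A minimal composition sketch, assuming `ws`, `datastore`, `compute` and a build id are available; the path and experiment name are illustrative:

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

tfrecords, prep = process_step(datastore, compute, 'seer/images')
training, train = train_step(datastore, tfrecords, compute)
model, register = register_step(datastore, training, compute, build='20200101.1')

pipeline = Pipeline(workspace=ws, steps=[prep, train, register])
pipeline.validate()
run = Experiment(ws, 'seer-training').submit(pipeline)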
source_directory = 'Training'
est = TensorFlow(source_directory=source_directory,
                 compute_target=compute_target_cpu,
                 entry_script='train.py',
                 use_gpu=False,
                 framework_version='1.13')

from azureml.pipeline.steps import EstimatorStep

trainingStep = EstimatorStep(name="Training-Step",
                             estimator=est,
                             estimator_entry_script_arguments=[
                                 "--input_data_location", processed_mnist_data,
                                 '--batch-size', 50, '--first-layer-neurons',
                                 300, '--second-layer-neurons', 100,
                                 '--learning-rate', 0.01, "--release_id", 0,
                                 '--model_name', model_name
                             ],
                             runconfig_pipeline_params=None,
                             inputs=[processed_mnist_data],
                             compute_target=compute_target_cpu)

print("Model Training Step is Completed")

# source directory
source_directory = 'RegisterModel'

modelEvalReg = PythonScriptStep(
    name="Evaluate and Register Model",
    script_name="evaluate_model.py",
    arguments=["--release_id", 0, '--model_name', model_name],
def main():
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = "AML-RG-" + os.environ.get("BASE_NAME")
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    aks_name = os.environ.get("AKS_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")
    experiment_name = os.environ.get("EXPERIMENT_NAME")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(workspace_name, resource_group,
                                  subscription_id, tenant_id, app_id,
                                  app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, compute_name, vm_size)
    if aml_compute is not None:
        print(aml_compute)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=['numpy', 'pandas', 'scikit-learn', 'keras'],
        pip_packages=[
            'azure', 'azureml-sdk', 'azure-storage', 'azure-storage-blob',
            'transformers>=2.1.1', 'tensorflow>=2.0.0', 'tensorflow-gpu>=2.0.0'
        ]))
    run_config.environment.docker.enabled = True

    datastore_name = 'tfworld'
    container_name = 'azure-service-classifier'
    account_name = 'johndatasets'
    sas_token = '?sv=2019-02-02&ss=bfqt&srt=sco&sp=rl&se=2021-06-02T03:40:25Z&st=2020-03-09T19:40:25Z&spr=https&sig=bUwK7AJUj2c%2Fr90Qf8O1sojF0w6wRFgL2c9zMVCWNPA%3D'

    try:
        existing_datastore = Datastore.get(aml_workspace, datastore_name)
    except:  # noqa: E722
        existing_datastore = Datastore \
            .register_azure_blob_container(workspace=aml_workspace,
                                           datastore_name=datastore_name,
                                           container_name=container_name,
                                           account_name=account_name,
                                           sas_token=sas_token
                                           )

    azure_dataset = Dataset.File.from_files(path=(existing_datastore, 'data'))
    azure_dataset = azure_dataset.register(
        workspace=aml_workspace,
        name='Azure Services Dataset',
        description='Dataset containing azure related posts on Stackoverflow',
        create_new_version=True)

    azure_dataset.to_path()
    input_data = azure_dataset.as_named_input('input_data1').as_mount(
        '/tmp/data')

    model_name = PipelineParameter(name="model_name", default_value=model_name)
    max_seq_length = PipelineParameter(name="max_seq_length",
                                       default_value=128)
    learning_rate = PipelineParameter(name="learning_rate", default_value=3e-5)
    num_epochs = PipelineParameter(name="num_epochs", default_value=3)
    export_dir = PipelineParameter(name="export_dir",
                                   default_value="./outputs/exports")
    batch_size = PipelineParameter(name="batch_size", default_value=32)
    steps_per_epoch = PipelineParameter(name="steps_per_epoch",
                                        default_value=100)

    # initialize the TensorFlow estimator
    estimator = TensorFlow(source_directory=sources_directory_train,
                           entry_script=train_script_path,
                           compute_target=aml_compute,
                           framework_version='2.0',
                           use_gpu=True,
                           pip_packages=[
                               'transformers==2.0.0',
                               'azureml-dataprep[fuse,pandas]==1.3.0'
                           ])

    train_step = EstimatorStep(
        name="Train Model",
        estimator=estimator,
        estimator_entry_script_arguments=[
            "--data_dir", input_data, "--max_seq_length", max_seq_length,
            "--learning_rate", learning_rate, "--num_epochs", num_epochs,
            "--export_dir", export_dir, "--batch_size", batch_size,
            "--steps_per_epoch", steps_per_epoch
        ],
        compute_target=aml_compute,
        inputs=[input_data],
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        compute_target=aml_compute,
        source_directory=sources_directory_train,
        arguments=[
            "--model_name",
            model_name,
            "--build_id",
            build_id,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    # Currently, the Evaluate step will automatically register
    # the model if it performs better. This step is based on a
    # previous version of the repo which utilized JSON files to
    # track evaluation results.

    evaluate_step.run_after(train_step)
    steps = [evaluate_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline",
        version=build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')

    response = published_pipeline.submit(  # noqa: F841
        workspace=aml_workspace,
        experiment_name=experiment_name)

    # Get AKS cluster for deployment
    aks_compute = get_aks(aml_workspace, aks_name)
    if aks_compute is not None:
        print(aks_compute)
Example #11
# environment = Environment.from_conda_specification(name="sentiment-env", file_path="experiment-env.yml")
# environment.register(ws)
environment = Environment.get(ws, "sentiment-env")

estimator = TensorFlow(
    source_directory="imdb",
    entry_script="experiment.py",
    framework_version="2.1",
    conda_packages=["python=3.7.4", "tensorflow", "tensorflow-datasets"],
    pip_packages=["azureml-sdk[notebooks,automl,explain]"],
    compute_target="archi-trainer")

model_step = EstimatorStep(
    name="training model",
    estimator=estimator,
    compute_target="archi-trainer",
    estimator_entry_script_arguments=['--n-words', 5000, '--epochs', 2])
# register_step = PythonScriptStep(name="register pipeline", source_directory="sentiment_analysis", script_name="registration.py", compute_target="dummy", runconfig=run_config)
# register_step.run_after(model_step)

sentiment_pipe = Pipeline(workspace=ws, steps=[model_step])
sentiment_pipe.validate()

experiment = Experiment(workspace=ws, name="sentiment-analysis")
run = experiment.submit(config=sentiment_pipe)

run.wait_for_completion(show_output=True)

ds.upload('outputs/sentiment_model.h5',
          'models',
Example #12
data_ref = DataReference(datastore=default_ds,
                         data_reference_name='data_ref',
                         path_on_datastore="config/")

estimator = Estimator(source_directory=experiment_folder,
                      compute_target=pipeline_cluster,
                      environment_definition=pipeline_run_config.environment,
                      entry_script='train.py')

# Step 1, run the estimator to train the model
train_step = EstimatorStep(
    name="Train Model",
    estimator=estimator,
    estimator_entry_script_arguments=[
        '--output_folder', model_folder, '--data_dir', data_ref
    ],
    inputs=[fraud_ds.as_named_input('fraud_train'), data_ref],
    outputs=[model_folder],
    compute_target=pipeline_cluster,
    allow_reuse=True)

# Step 2, run the model registration script
register_step = PythonScriptStep(name="Register Model",
                                 source_directory=experiment_folder,
                                 script_name="register.py",
                                 arguments=['--model_folder', model_folder],
                                 inputs=[model_folder],
                                 compute_target=pipeline_cluster,
                                 runconfig=pipeline_run_config,
                                 allow_reuse=True)
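
Neither step above has been submitted yet; the remaining boilerplate is the usual pipeline build and run. A sketch, assuming `ws` and an experiment name of your choosing:

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

pipeline = Pipeline(workspace=ws, steps=[train_step, register_step])
pipeline.validate()
run = Experiment(ws, 'fraud-training').submit(pipeline)
run.wait_for_completion(show_output=True)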
Example #13
#######################################################################################################

est = TensorFlow(source_directory='./scripts/train',
                 compute_target=GPU_compute_target,
                 entry_script="estimator_training.py",
                 pip_packages=[
                     'keras<=2.3.1', 'matplotlib', 'opencv-python',
                     'azure-storage-blob==2.1.0', 'tensorflow-gpu==2.0.0'
                 ],
                 conda_packages=['scikit-learn==0.22.1'],
                 use_gpu=True)

est_step = EstimatorStep(name="Estimator_Train",
                         estimator=est,
                         estimator_entry_script_arguments=[
                             '--PreProcessingData', PreProcessingData
                         ],
                         inputs=[PreProcessingData],
                         runconfig_pipeline_params=None,
                         compute_target=GPU_compute_target)

#######################################################################################################

register_step = PythonScriptStep(name="register_step",
                                 script_name="estimator_register.py",
                                 runconfig=run_config_user_managed,
                                 source_directory='./scripts/register',
                                 arguments=['--ModelData', ModelData],
                                 outputs=[ModelData],
                                 compute_target=GPU_compute_target)

#######################################################################################################
Example #14
def create_experiment_config(workspace):
    ########################################
    ### Creating data prep Pipeline Step ###
    ########################################

    # Load settings
    print("Loading settings")
    data_prep_step_path = os.path.join("steps", "data_prep")
    with open(os.path.join(data_prep_step_path, "step.json")) as f:
        data_prep_settings = json.load(f)

    # Setup datasets of first step
    print("Setting up datasets")
    data_prep_input = Dataset.get_by_name(workspace=workspace,
                                          name=data_prep_settings.get(
                                              "dataset_input_name",
                                              None)).as_named_input(
                                                  data_prep_settings.get(
                                                      "dataset_input_name",
                                                      None)).as_mount()
    data_prep_output = PipelineData(
        name=data_prep_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=data_prep_settings.get(
                                "datastore_output_name",
                                "workspaceblobstore")),
        output_mode="mount").as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #data_prep_output.register(
    #    name=data_prep_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    data_prep_dependencies = CondaDependencies.create(
        pip_packages=data_prep_settings.get("pip_packages", []),
        conda_packages=data_prep_settings.get("conda_packages", []),
        python_version=data_prep_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    data_prep_run_config = RunConfiguration(
        conda_dependencies=data_prep_dependencies,
        framework=data_prep_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    data_prep_compute_target = ComputeTarget(workspace=workspace,
                                             name=data_prep_settings.get(
                                                 "compute_target_name", None))

    # Create python step
    print("Creating Step")
    data_prep = PythonScriptStep(
        name=data_prep_settings.get("step_name", None),
        script_name=data_prep_settings.get("script_name", None),
        arguments=data_prep_settings.get("arguments", []),
        compute_target=data_prep_compute_target,
        runconfig=data_prep_run_config,
        inputs=[data_prep_input],
        outputs=[data_prep_output],
        params=data_prep_settings.get("parameters", []),
        source_directory=data_prep_step_path,
        allow_reuse=data_prep_settings.get("allow_reuse", True),
        version=data_prep_settings.get("version", None),
    )

    ###############################################
    ### Creating data model train Pipeline Step ###
    ###############################################

    # Load settings
    print("Loading settings")
    model_train_step_path = os.path.join("steps", "model_train")
    with open(os.path.join(model_train_step_path, "step.json")) as f:
        model_train_settings = json.load(f)
    hyperparameter_sampling_settings = model_train_settings.get(
        "hyperparameter_sampling", {})

    # Setup datasets of first step
    print("Setting up datasets")
    model_train_input = data_prep_output.as_named_input(
        name=model_train_settings.get("dataset_input_name", None))
    model_train_output = PipelineData(
        name=model_train_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=model_train_settings.get(
                                "datastore_output_name", None)),
        output_mode="mount",
    ).as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #model_train_output.register(
    #    name=model_train_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    model_train_dependencies = CondaDependencies.create(
        pip_packages=model_train_settings.get("pip_packages", []),
        conda_packages=model_train_settings.get("conda_packages", []),
        python_version=model_train_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    model_train_run_config = RunConfiguration(
        conda_dependencies=model_train_dependencies,
        framework=model_train_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    model_train_compute_target = ComputeTarget(workspace=workspace,
                                               name=model_train_settings.get(
                                                   "compute_target_name",
                                                   None))

    # Create distributed training backend
    print("Creating distributed training backend")
    distributed_training_backend = get_distributed_backend(
        backend_name=model_train_settings.get("distributed_backend", None))

    # Create Estimator for Training
    print("Creating Estimator for training")
    model_train_estimator = Estimator(
        source_directory=model_train_step_path,
        entry_script=model_train_settings.get("script_name", None),
        environment_variables=model_train_settings.get("parameters", None),
        compute_target=model_train_compute_target,
        node_count=model_train_settings.get("node_count", None),
        distributed_training=distributed_training_backend,
        conda_packages=model_train_settings.get("conda_packages", None),
        pip_packages=model_train_settings.get("pip_packages", None),
    )

    try:
        # Create parameter sampling
        print("Creating Parameter Sampling")
        parameter_dict = {}
        parameters = hyperparameter_sampling_settings.get(
            "parameters",
            {}) if "parameters" in hyperparameter_sampling_settings else {}
        for parameter_name, parameter_details in parameters.items():
            parameter_distr = get_parameter_distribution(
                distribution=parameter_details.get("distribution", None),
                **parameter_details.get("settings", {}))
            parameter_dict[f"--{parameter_name}"] = parameter_distr
        model_train_ps = get_parameter_sampling(
            sampling_method=hyperparameter_sampling_settings.get(
                "method", None),
            parameter_dict=parameter_dict)

        # Get Policy definition
        policy_settings = hyperparameter_sampling_settings.get("policy", {})
        kwargs = {
            key: value
            for key, value in policy_settings.items() if key not in
            ["policy_method", "evaluation_interval", "delay_evaluation"]
        }

        # Create termination policy
        print("Creating early termination policy")
        model_train_policy = get_policy(
            policy_method=policy_settings.get("method", ""),
            evaluation_interval=policy_settings.get("evaluation_interval",
                                                    None),
            delay_evaluation=policy_settings.get("delay_evaluation", None),
            **kwargs)

        # Create HyperDriveConfig
        print("Creating HyperDriveConfig")
        model_train_hyperdrive_config = HyperDriveConfig(
            estimator=model_train_estimator,
            hyperparameter_sampling=model_train_ps,
            policy=model_train_policy,
            primary_metric_name=hyperparameter_sampling_settings.get(
                "primary_metric", None),
            primary_metric_goal=PrimaryMetricGoal.MINIMIZE
            if "min" in hyperparameter_sampling_settings.get(
                "primary_metric_goal", None) else PrimaryMetricGoal.MAXIMIZE,
            max_total_runs=hyperparameter_sampling_settings.get(
                "max_total_runs", 1),
            max_concurrent_runs=hyperparameter_sampling_settings.get(
                "max_concurrent_runs", 1),
            max_duration_minutes=hyperparameter_sampling_settings.get(
                "max_duration_minutes", None))

        # Create HyperDriveStep
        print("Creating HyperDriveStep")
        model_train = HyperDriveStep(
            name=model_train_settings.get("step_name", None),
            hyperdrive_config=model_train_hyperdrive_config,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments", None),
            inputs=[model_train_input],
            outputs=[model_train_output],
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", True))
    except:
        print("Not all required parameters specified for HyperDrive step")

        # Create EstimatorStep
        print("Creating EstimatorStep")
        model_train = EstimatorStep(
            name=model_train_settings.get("step_name", None),
            estimator=model_train_estimator,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments", None),
            inputs=[model_train_input],
            outputs=[model_train_output],
            compute_target=model_train_compute_target,
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", True))

    #########################
    ### Creating Pipeline ###
    #########################

    # Create Pipeline
    print("Creating Pipeline")
    pipeline = Pipeline(
        workspace=workspace,
        steps=[model_train],
        description="Training Pipeline",
    )

    # Validate pipeline
    print("Validating pipeline")
    pipeline.validate()

    return pipeline
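
A caller only needs a Workspace handle; everything else is driven by the step.json settings files. A minimal usage sketch (the experiment name is illustrative and assumes a config.json in the working directory):

from azureml.core import Experiment, Workspace

ws = Workspace.from_config()
pipeline = create_experiment_config(ws)
run = Experiment(ws, 'settings-driven-training').submit(pipeline)
run.wait_for_completion(show_output=True)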
Example #15
train = Estimator(source_directory='.',
                  compute_target=compute,
                  entry_script='train.py',
                  use_gpu=True,
                  pip_requirements_file='requirements-dataprepandtraining.txt')

trainStep = EstimatorStep(
    name='Model Training',
    estimator=train,
    estimator_entry_script_arguments=[
        "--source_path",
        seer_tfrecords,
        "--target_path",
        seer_training,
        "--epochs",
        5,  # Consider transfer learning. See line 111 in train.py file.
        "--batch",
        10,
        "--lr",
        0.001
    ],
    inputs=[seer_tfrecords],
    outputs=[seer_training],
    compute_target=compute)

## Register Model Step ##
# Once training is complete, register.py registers the model with AML #

# Configuration for registration step #
registerEnvironment = Environment.from_pip_requirements(
    'registerenv', 'requirements-registration.txt')
Example #16
    print('Error while retrieving compute', e)
    sys.exit(-1)


################################
# If you want to use datastore
################################
# from azureml.core import Datastore
# from azureml.data.data_reference import DataReference
# from azureml.pipeline.core import PipelineData

# def_blob_store = Datastore(ws, "workspaceblobstore")

# input_data = DataReference(
#     datastore=def_blob_store,
#     data_reference_name="input_data",
#     path_on_datastore="20newsgroups/20news.pkl")

# output = PipelineData("output", datastore=def_blob_store)


est_step = EstimatorStep(name="Estimator_Train", 
                         estimator=est, 
                         estimator_entry_script_arguments=["--datadir", input_data, "--output", output],
                         runconfig_pipeline_params=None, 
                         inputs=[input_data], 
                         outputs=[output], 
                         compute_target=compute_target)

pipeline = Pipeline(workspace=ws, steps=[est_step])
pipeline_run = experiment.submit(pipeline)
Example #17
]

est = Estimator(source_directory='./local_scripts2/',
                entry_script='2-train.py',
                pip_packages=pip_packages,
                compute_target=compute_target)

# 2.C) Create the EstimatorStep object
from azureml.pipeline.steps import EstimatorStep

train_step = EstimatorStep(
    name="2: Training",
    estimator=est,
    estimator_entry_script_arguments=[
        "--reg", 0.8, "--datapreparation_output", datapreparation_output,
        "--datatrain_output", datatrain_output, "--is_directory",
        'aaa' if is_directory else ''
    ],  # all non-empty strings have a True boolean value
    inputs=[datapreparation_output],
    outputs=[datatrain_output],
    compute_target=compute_target)

#
# ***************** CREATE PythonScriptStep3: MODEL REGISTRATION **************************
#
# 3.A) Create PipelineData Object modelregistration_output
from azureml.pipeline.core import PipelineData
is_directory = False  # it's a file where we save the details of the registered model
default_datastore = ws.get_default_datastore()
modelregistration_output = PipelineData('modelregistration_output3',
                                        datastore=default_datastore,
Example #18
## Training Step ##
# train.py does the training based on the processed data #

seer_training = PipelineData("train", datastore=datastore, is_directory=True)

train = Estimator(source_directory='.',
                  compute_target=compute,
                  entry_script='train.py',
                  pip_requirements_file='requirements-dataprepandtraining.txt')

trainStep = EstimatorStep(name='Model Training',
                          estimator=train,
                          estimator_entry_script_arguments=[
                              "--source_path", seer_tfrecords, "--target_path",
                              seer_training, "--epochs", 5, "--batch", 10,
                              "--lr", 0.001
                          ],
                          inputs=[seer_tfrecords],
                          outputs=[seer_training],
                          compute_target=compute)

## Register Model Step ##
# Once training is complete, register.py registers the model with AML #

# Configuration for registration step #
registerEnvironment = Environment.from_pip_requirements(
    'registerenv', 'requirements-registration.txt')
registerRunConfig = RunConfiguration()
registerRunConfig.environment = registerEnvironment

seer_model = PipelineData("model", datastore=datastore, is_directory=True)
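
The snippet ends before the registration step itself is created. One plausible continuation is sketched below; the script arguments are purely hypothetical, since register.py is not shown above:

# Hypothetical continuation: argument names are illustrative, not taken from register.py.
registerStep = PythonScriptStep(name='Model Registration',
                                script_name='register.py',
                                source_directory='.',
                                arguments=['--source_path', seer_training,
                                           '--target_path', seer_model],
                                inputs=[seer_training],
                                outputs=[seer_model],
                                runconfig=registerRunConfig,
                                compute_target=compute)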
Example #19
]

# Create the PyTorch Estimator
trainEstimator = PyTorch(
    source_directory = script_folder,
    compute_target = cluster,
    entry_script = "steps/train.py", 
    use_gpu = True,
    framework_version='1.3'
)

# Create a pipeline step with the PyTorch Estimator
trainOnGpuStep = EstimatorStep(
    name='Train Estimator Step',
    estimator=trainEstimator,
    inputs=[training_data_location],
    outputs=[model],
    compute_target=cluster,
    estimator_entry_script_arguments = estimator_script_params
) 

## Register Model Step ##
# Once training is complete, register.py registers the model with AML #

# Configuration for registration step #
registerModelStep = PythonScriptStep(
    name="Register model in Model Management",
    script_name="steps/register.py",
    compute_target=cluster,
    inputs=[model],
    arguments=['--model_name', model_name,
                '--model_assets_path', model
def main():
    e = Env()
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group
    )
    print("get_workspace:")
    print(aml_workspace)

    aml_compute = get_compute(
        aml_workspace,
        e.compute_name,
        e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    environment = get_environment(
        aml_workspace, e.aml_env_name, create_new=e.rebuild_env)
    run_config = RunConfiguration()
    run_config.environment = environment

    if (e.datastore_name):
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name

    run_config.environment.environment_variables["DATASTORE_NAME"] \
        = datastore_name

    dataset_name = e.dataset_name
    file_name = e.file_name
    datastore = Datastore.get(aml_workspace, datastore_name)

    if (dataset_name not in aml_workspace.datasets):
        raise Exception("Could not find dataset at \"%s\"." % dataset_name)
    else:
        dataset = Dataset.get_by_name(aml_workspace, name=dataset_name)
        dataset.download(target_path='.', overwrite=True)
        datastore.upload_files([file_name],
                               target_path=dataset_name,
                               overwrite=True)

    raw_data_file = DataReference(datastore=datastore,
                                  data_reference_name="Raw_Data_File",
                                  path_on_datastore=dataset_name + '/'
                                  + file_name)

    clean_data_file = PipelineParameter(name="clean_data_file",
                                        default_value="/clean_data.csv")
    clean_data_folder = PipelineData("clean_data_folder",
                                     datastore=datastore)

    prepDataStep = PythonScriptStep(name="Prepare Data",
                                    source_directory=e.sources_directory_train,
                                    script_name=e.data_prep_script_path,
                                    arguments=["--raw_data_file",
                                               raw_data_file,
                                               "--clean_data_folder",
                                               clean_data_folder,
                                               "--clean_data_file",
                                               clean_data_file],
                                    inputs=[raw_data_file],
                                    outputs=[clean_data_folder],
                                    compute_target=aml_compute,
                                    allow_reuse=False)

    print("Step Prepare Data created")

    new_model_file = PipelineParameter(name="new_model_file ",
                                       default_value='/' + e.model_name
                                       + '.pkl')
    new_model_folder = PipelineData("new_model_folder", datastore=datastore)
    est = SKLearn(source_directory=e.sources_directory_train,
                  entry_script=e.train_script_path,
                  pip_packages=['azureml-sdk', 'scikit-learn==0.20.3',
                                'azureml-dataprep[pandas,fuse]>=1.1.14'],
                  compute_target=aml_compute)

    trainingStep = EstimatorStep(
        name="Model Training",
        estimator=est,
        estimator_entry_script_arguments=["--clean_data_folder",
                                          clean_data_folder,
                                          "--new_model_folder",
                                          new_model_folder,
                                          "--clean_data_file",
                                          clean_data_file.default_value,
                                          "--new_model_file",
                                          new_model_file.default_value],
        runconfig_pipeline_params=None,
        inputs=[clean_data_folder],
        outputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Train created")

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)

    evaluateStep = PythonScriptStep(
        name="Evaluate Model",
        source_directory=e.sources_directory_train,
        script_name=e.evaluate_script_path,
        arguments=["--model_name", model_name_param],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Evaluate created")

    registerStep = PythonScriptStep(
        name="Register Model",
        source_directory=e.sources_directory_train,
        script_name=e.register_script_path,
        arguments=["--new_model_folder", new_model_folder,
                   "--new_model_file", new_model_file,
                   "--model_name", model_name_param],
        inputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Register created")

    if ((e.run_evaluation).lower() == 'true'):
        print("Include evaluation step before register step.")
        trainingStep.run_after(prepDataStep)
        evaluateStep.run_after(trainingStep)
        registerStep.run_after(evaluateStep)
    else:
        print("Exclude evaluation step and directly run register step.")
        trainingStep.run_after(prepDataStep)
        registerStep.run_after(trainingStep)

    pipeline = Pipeline(workspace=aml_workspace, steps=[registerStep])
    pipeline.validate()
    print("Pipeline is built")

    pipeline._set_experiment_name
    published_pipeline = pipeline.publish(
        name=e.pipeline_name,
        description="Predict Employee Retention Model training pipeline",
        version=e.build_id
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Example #21
# Create a PipelineData (Data Reference) for the model folder
model_folder = PipelineData("model_folder",
                            datastore=ws.get_default_datastore())

estimator = Estimator(source_directory=experiment_folder,
                      compute_target=gpu_cluster,
                      environment_definition=pipeline_run_config.environment,
                      entry_script='train_diabetes.py')

#Step 1
train_step = EstimatorStep(name="Train Model",
                           estimator=estimator,
                           estimator_entry_script_arguments=[
                               '--regularization', 0.1, '--output_folder',
                               model_folder
                           ],
                           inputs=[diabetes_ds.as_named_input('diabetes')],
                           outputs=[model_folder],
                           compute_target=gpu_cluster,
                           allow_reuse=False)

# Step 2, run the model registration script
register_step = PythonScriptStep(name="Register Model",
                                 source_directory=experiment_folder,
                                 script_name="register_diabetes.py",
                                 arguments=['--model_folder', model_folder],
                                 inputs=[model_folder],
                                 compute_target=gpu_cluster,
                                 runconfig=pipeline_run_config,
                                 allow_reuse=False)
    source_directory="preprocess",
    runconfig=run_config,
    allow_reuse=True,
)

estimator_step = EstimatorStep(
    name="Train Model",
    estimator=Estimator(
        source_directory="src/train",
        entry_script="train_pipeline.py",
        compute_target=compute_target,
        environment_definition=freezer_environment,
    ),
    estimator_entry_script_arguments=[
        "--input",
        output_df_nested,
        "--n_estimators",
        n_estimators_param,
        "--train_data_split",
        train_data_split_param,
    ],
    runconfig_pipeline_params=None,
    inputs=[output_df_nested],
    compute_target=compute_target,
    allow_reuse=True,
)

deploy_step = PythonScriptStep(
    name="Deploy Model",
    script_name="deploy.py",
    arguments=[
Example #23
print("Blobstore's name: {}".format(def_blob_store.name))

blob_output_data = DataReference(datastore=def_blob_store,
                                 data_reference_name="data",
                                 path_on_datastore="data")
print("DataReference object created")

est = Estimator(source_directory='.',
                compute_target=ws.compute_targets['cpu'],
                entry_script='azureml-issues.py',
                pip_packages=['azure-devops', 'pandas'])

data_processing = EstimatorStep(estimator=est,
                                estimator_entry_script_arguments=[
                                    '--data_path', blob_output_data,
                                    '--analyze', '--load_open', '--load_closed'
                                ],
                                inputs=[blob_output_data],
                                compute_target=ws.compute_targets['cpu'],
                                allow_reuse=False)

pipeline = Pipeline(workspace=ws, steps=[data_processing])
print("Pipeline is built")

pipeline.validate()
print("Simple validation complete")

pipeline_run = Experiment(ws, 'issues_pipeline').submit(pipeline)
print("Pipeline is submitted for execution")

published_pipeline = pipeline.publish(
    name="Issues_Stats",
Example #24
bootstrap_args = [r_script]

estimator = Estimator(source_directory='src',
                      entry_script='bootstrapper.py',
                      compute_target=aml_compute_target,
                      custom_docker_image=acr_image,
                      image_registry_details=acr_details,
                      user_managed=True)

inputs = []

step = EstimatorStep(name='execute-r',
                     estimator=estimator,
                     estimator_entry_script_arguments=bootstrap_args,
                     inputs=inputs,
                     outputs=None,
                     compute_target=aml_compute_target,
                     allow_reuse=False)

aml_pipeline = AmlPipeline(workspace=aml_workspace,
                           steps=AmlStepSequence([step]),
                           description='Run R Workloads')

published_pipeline = aml_pipeline.publish(description='Execute R Workload',
                                          name='pipeline-r')

aml_run = published_pipeline.submit(workspace=aml_workspace,
                                    experiment_name=aml_experiment_name)

if (aml_run):
preprocessing_est = SKLearn(
    source_directory='010-preprocessing',
    compute_target=cpu_cluster,
    entry_script='dataprep.py',
    conda_packages=['pandas'],
    pip_packages=['fastavro'],
)

output = PipelineData("output", datastore=telemetry_ds)
preprocessing_step = EstimatorStep(
    name="Preprocessing_Train",
    estimator=preprocessing_est,
    estimator_entry_script_arguments=[
        "--data_dir", input_data, "--output_data_dir", output
    ],
    inputs=[input_data],
    outputs=[output],
    compute_target=cpu_cluster,
    allow_reuse=True,
)

pytorch_est = PyTorch(
    source_directory='020-ann',
    compute_target=cpu_cluster,
    entry_script='pytorch_train.py',
    use_gpu=False,
    framework_version='1.1',
    conda_packages=['pandas'],
)
Example #26
        source_directory=script_folder,
        compute_target=compute_target,
        environment_definition=pipeline_run_config.environment,
        entry_script='train.py')

    model_candidate_folder = PipelineData('model_candidate_folder',
                                          datastore=data_store)

    # Step to run an estimator
    train_step = EstimatorStep(
        name='Train model',
        estimator=estimator_train,
        compute_target=compute_target,
        # Specify PipelineData as input
        inputs=[input_data],
        outputs=[model_candidate_folder],
        # Pass as data reference to estimator script
        estimator_entry_script_arguments=[
            '--input_data', input_data, '--model_candidate_folder',
            model_candidate_folder, '--mode', pipeline_mode_param
        ],
        allow_reuse=False)

    estimator_evaluate = Estimator(
        source_directory=script_folder,
        compute_target=compute_target,
        environment_definition=pipeline_run_config.environment,
        entry_script='eval_model.py')

    validated_model_folder = PipelineData('validated_model_folder',
                                          datastore=data_store)
Example #27
                                               min_node_count=None,
                                               timeout_in_minutes=0)

    script_params = {}
    estimator = PyTorch(source_directory='./',
                        script_params=script_params,
                        compute_target=compute_target_gpu,
                        entry_script='train.py',
                        use_gpu=True,
                        pip_packages=[],
                        framework_version='1.2')

    est_step = EstimatorStep(name="Train_Step",
                             estimator=estimator,
                             estimator_entry_script_arguments=auth_params,
                             runconfig_pipeline_params=None,
                             inputs=[],
                             outputs=[],
                             compute_target=compute_target_gpu)
    est_step.run_after(process_step)

    #     step 3

    pipeline = Pipeline(workspace=ws, steps=[process_step, est_step])
    pipeline_run_first = Experiment(
        ws, project_config['experiment_name']).submit(pipeline)
    #https://docs.microsoft.com/en-us/python/api/azureml-pipeline-core/azureml.pipeline.core.pipelinerun?view=azure-ml-py
    pipeline_run_first.wait_for_completion(show_output=True,
                                           timeout_seconds=9223372036854775807,
                                           raise_on_error=True)
def main():
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = os.environ.get("BASE_NAME") + "-AML-RG"
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")
    build_id = os.environ.get("BUILD_BUILDID")
    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")
    data_path = os.environ.get("DATA_PATH_DATASTORE")
    model_data_path = os.environ.get("MODEL_DATA_PATH_DATASTORE")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(workspace_name, resource_group,
                                  subscription_id, tenant_id, app_id,
                                  app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, compute_name, vm_size)
    if aml_compute is not None:
        print(aml_compute)

    model_name = PipelineParameter(name="model_name", default_value=model_name)
    release_id = PipelineParameter(name="release_id", default_value="0")

    ds = aml_workspace.get_default_datastore()

    dataref_folder = ds.path(data_path).as_mount()
    model_dataref = ds.path(model_data_path).as_mount()

    # NOTE: these two folders must already exist on the datastore, and the env variables must be set in the variable groups

    #ds.upload(src_dir='./VOCdevkit', target_path='VOCdevkit', overwrite=True, show_progress=True)
    #ds.upload(src_dir='./model_data', target_path='VOCmodel_data', overwrite=True, show_progress=True)

    yoloEstimator = TensorFlow(
        source_directory=sources_directory_train + '/training',
        compute_target=aml_compute,
        entry_script=train_script_path,
        pip_packages=[
            'keras', 'pillow', 'matplotlib', 'onnxmltools', 'keras2onnx==1.5.1'
        ],  # recent versions of keras2onnx give conversion issues 
        use_gpu=True,
        framework_version='1.13')

    train_step = EstimatorStep(name="Train & Convert Model",
                               estimator=yoloEstimator,
                               estimator_entry_script_arguments=[
                                   "--release_id", release_id, "--model_name",
                                   model_name, "--data_folder", dataref_folder,
                                   "--model_path", model_dataref
                               ],
                               runconfig_pipeline_params=None,
                               inputs=[dataref_folder, model_dataref],
                               compute_target=aml_compute,
                               allow_reuse=False)
    print("Step Train & Convert created")

    train_pipeline = Pipeline(workspace=aml_workspace, steps=[train_step])
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name,
        description="Model training/retraining pipeline",
        version=build_id)
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Example #29
train_step = EstimatorStep(
    name="Training Step",
    estimator=estimator,
    estimator_entry_script_arguments=[
        "--model_name_or_path",
        model_name_param,
        "--task",
        task_param,
        "--max_seq_length",
        max_seq_len_param,
        "--max_epochs",
        max_epochs_param,
        "--learning_rate",
        learning_rate_param,
        "--seed",
        seed_param,
        "--gpus",
        num_gpus_param,
        "--num_workers",
        num_workers_param,
        "--train_batch_size",
        train_batch_size_param,
        "--eval_batch_size",
        eval_batch_size_param,
        "--output_dir",
        "./outputs",
        "--do_train",
        "--do_predict",
    ],
    inputs=[prepared_dataset.as_mount()],
    compute_target=compute_target,
)
Example #30
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.train.estimator import Estimator

# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes dataset")

# Create a PipelineData (temporary Data Reference) for the model folder
model_folder = PipelineData("model_folder", datastore=ws.get_default_datastore())

estimator = Estimator(source_directory=experiment_folder,
                        compute_target = pipeline_cluster,
                        environment_definition=pipeline_run_config.environment,
                        entry_script='train_diabetes.py')

# Step 1, run the estimator to train the model
train_step = EstimatorStep(name = "Train Model",
                           estimator=estimator, 
                           estimator_entry_script_arguments=['--output_folder', model_folder],
                           inputs=[diabetes_ds.as_named_input('diabetes_train')],
                           outputs=[model_folder],
                           compute_target = pipeline_cluster,
                           allow_reuse = True)


 # Step 2, run the model registration script
register_step = PythonScriptStep(name = "Register Model",
                                source_directory = experiment_folder,
                                script_name = "register_diabetes.py",
                                arguments = ['--model_folder', model_folder],
                                inputs=[model_folder],
                                compute_target = pipeline_cluster,