コード例 #1
0
    def create_hyperdrive_trainer(self, estimator, hd_dict, search_type,
                                  metric_name, maximize_metric,
                                  early_term_policy, max_total_runs,
                                  max_concurrent_runs, max_minutes):

        from azureml.train.hyperdrive import RandomParameterSampling, GridParameterSampling, BayesianParameterSampling

        if search_type == "random":
            ps = RandomParameterSampling(hd_dict)
        elif search_type == "grid":
            ps = GridParameterSampling(hd_dict)
        elif search_type == "bayesian":
            ps = BayesianParameterSampling(hd_dict)
        else:
            errors.config_error(
                "Azure ML Hyperdrive search_type not supported: " +
                search_type)

        max_concurrent_runs = min(max_total_runs, max_concurrent_runs)

        from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal

        trainer = HyperDriveConfig(
            estimator=estimator,
            hyperparameter_sampling=ps,
            policy=early_term_policy,
            primary_metric_name=metric_name,
            primary_metric_goal=PrimaryMetricGoal.MAXIMIZE
            if maximize_metric else PrimaryMetricGoal.MINIMIZE,
            max_total_runs=max_total_runs,
            max_concurrent_runs=max_concurrent_runs,
            max_duration_minutes=max_minutes)

        return trainer
コード例 #2
0
    def get_parameter_search_hyperdrive_config(
            self, estimator: Estimator) -> HyperDriveConfig:
        """
        Specify an Azure HyperDrive configuration.
        Further details are described in the tutorial
        https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters
        A reference is provided at https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train
        .hyperdrive?view=azure-ml-py
        :param estimator: The estimator (configured PyTorch environment) of the experiment.
        :return: An Azure HyperDrive run configuration (configured PyTorch environment).
        """
        parameter_space = {'l_rate': uniform(0.0005, 0.01)}

        param_sampling = RandomParameterSampling(parameter_space)

        # early terminate poorly performing runs
        early_termination_policy = BanditPolicy(slack_factor=0.15,
                                                evaluation_interval=1,
                                                delay_evaluation=10)

        return HyperDriveConfig(
            estimator=estimator,
            hyperparameter_sampling=param_sampling,
            policy=early_termination_policy,
            primary_metric_name=TrackedMetrics.Val_Loss.value,
            primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
            max_total_runs=10,
            max_concurrent_runs=2)
コード例 #3
0
def hyperparameter_tuning(ws,experiment):
    # Create and submit a Hyperdrive job
    cluster = ws.compute_targets[AML.compute_name]
    script_params={
        '--datastore-dir': ws.get_default_datastore().as_mount(),
    }
    tf_estimator = TensorFlow(source_directory='scripts',
                              compute_target=cluster,
                              entry_script='train.py',
                              script_params=script_params,
                              use_gpu=True)
    ps = RandomParameterSampling(
        {
            '--learning-rate': loguniform(-15, -3)
        }
    )
    early_termination_policy = BanditPolicy(slack_factor = 0.15, evaluation_interval=2)
    hyperdrive_run_config = HyperDriveRunConfig(estimator = tf_estimator, 
                                                hyperparameter_sampling = ps, 
                                                policy = early_termination_policy,
                                                primary_metric_name = "validation_accuracy",
                                                primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
                                                max_total_runs = 20,
                                                max_concurrent_runs = 4)

    hd_run = experiment.submit(hyperdrive_run_config)
    RunDetails(Run(experiment, hd_run.id)).show()
    return hd_run
コード例 #4
0
def _create_dummy_hyperdrive_param_search_config(
        estimator: Estimator) -> HyperDriveConfig:
    return HyperDriveConfig(estimator=estimator,
                            hyperparameter_sampling=RandomParameterSampling(
                                {'l_rate': uniform(0.0005, 0.01)}),
                            primary_metric_name=TrackedMetrics.Val_Loss.value,
                            primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                            max_total_runs=HYPERDRIVE_TOTAL_RUNS)
コード例 #5
0
def main(epochs, iterations, compute_target, concurrent_runs):
    cli_auth = AzureCliAuthentication()

    experiment = Experiment.from_directory(".", auth=cli_auth)
    ws = experiment.workspace

    cluster = ws.compute_targets[compute_target]
    food_data = ws.datastores['food_images']

    script_arguments = {"--data-dir": food_data.as_mount(), "--epochs": epochs}

    tf_est = TensorFlow(source_directory=".",
                        entry_script='code/train/train.py',
                        script_params=script_arguments,
                        compute_target=cluster,
                        conda_packages=['pillow', 'pandas'],
                        pip_packages=['click', 'seaborn'],
                        use_docker=True,
                        use_gpu=True,
                        framework_version='1.13')

    # Run on subset of food categories
    tf_est.run_config.arguments.extend(
        ['apple_pie', 'baby_back_ribs', 'baklava', 'beef_carpaccio'])

    param_sampler = RandomParameterSampling({
        '--minibatch-size':
        choice(16, 32, 64),
        '--learning-rate':
        loguniform(-9, -6),
        '--optimizer':
        choice('rmsprop', 'adagrad', 'adam')
    })

    # Create Early Termination Policy
    etpolicy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

    # Create HyperDrive Run Configuration
    hyper_drive_config = HyperDriveConfig(
        estimator=tf_est,
        hyperparameter_sampling=param_sampler,
        policy=etpolicy,
        primary_metric_name='acc',
        primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
        max_total_runs=iterations,
        max_concurrent_runs=concurrent_runs)

    # Submit the Hyperdrive Run
    print("Submitting Hyperdrive Run")
    hd_run = experiment.submit(hyper_drive_config)
    hd_run.wait_for_completion(raise_on_error=True, show_output=True)
    print("Finishing Run")
    best_run = hd_run.get_best_run_by_primary_metric()
    print(f'##vso[task.setvariable variable=run_id]{best_run.id}')
コード例 #6
0
ファイル: utils.py プロジェクト: soniaang/ProjectGemini
def get_parameter_sampling(sampling_method, parameter_dict):
    if "random" in sampling_method.lower():
        ps = RandomParameterSampling(
            parameter_space=parameter_dict
        )
    elif "grid" in sampling_method.lower():
        ps = GridParameterSampling(
            parameter_space=parameter_dict
        )
    elif "bayesian" in sampling_method.lower():
        ps = BayesianParameterSampling(
            parameter_space=parameter_dict
        )
    else:
        ps = None
        raise RunConfigurationException("Parameter Sampling Method not defined in settings. Please choose between \'random\', \'grid\' and \'bayesian\'")
    return ps
コード例 #7
0
ファイル: utils.py プロジェクト: zm-intern06/MLDevOps
def get_parameter_sampling(sampling_method, parameter_settings):
    parameter_dict = {}
    for parameter_name, parameter_setting in parameter_settings.items():
        parameter_distr = get_parameter_distribution(parameter_name,
                                                     parameter_setting)
        parameter_dict["--{}".format(parameter_name)] = parameter_distr

    if "random" in sampling_method:
        ps = RandomParameterSampling(parameter_dict)
    elif "grid" in sampling_method:
        ps = GridParameterSampling(parameter_dict)
    elif "bayesian" in sampling_method:
        ps = BayesianParameterSampling(parameter_dict)
    else:
        ps = None
        raise RunConfigurationException(
            "Parameter Sampling Method not defined in settings. Please choose between \'random\', \'grid\' and \'bayesian\'"
        )
    return ps
コード例 #8
0
def build_pipeline(dataset, ws, config):
    print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))

    base_dir = '.'
        
    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)
    
    shutil.copy(os.path.join(base_dir, 'video_decoding.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_submit.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_create.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'train.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'prednet.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'keras_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_preparation.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'model_registration.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'config.json'), script_folder)
    
    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except:# ComputeTargetException:
        print("creating new compute target")
        
        provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', 
                                                                    max_nodes=4,
                                                                    idle_seconds_before_scaledown=1800)    
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name, provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
        
    # use get_status() to get a detailed status for the current cluster. 
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = config['gpu_compute']

    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except: 
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', 
                                                                    max_nodes=10,
                                                                    idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name, provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout. 
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster. 
    try:
        print(gpu_compute_target.get_status().serialize())
    except BaseException as e:
        print("Could not get status of compute target.")
        print(e)

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(conda_packages=["py-opencv=3.4.2"], pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D', pip_packages=["azure-storage-blob==1.5.0", "hickle==3.4.3", "requests==2.21.0", "sklearn", "pandas==0.24.2", "azureml-sdk", "numpy==1.16.2", "pillow==6.0.0"])
    
    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    # DataReference to where video data is stored.
    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))
    print("DataReference object created")
        
    # Naming the intermediate data as processed_data1 and assigning it to the variable processed_data1.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames", datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py", 
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target, 
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    print("video_decode step created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py", 
        arguments=["--input_data", raw_data, "--output_data", preprocessed_data],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target, 
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    data_prep.run_after(video_decoding)

    print("data_prep step created")


    # configure access to ACR for pulling our custom docker image
    acr = ContainerRegistry()
    acr.address = config['acr_address']
    acr.username = config['acr_username']
    acr.password = config['acr_password']
    
    est = Estimator(source_directory=script_folder,
                    compute_target=gpu_compute_target,
                    entry_script='train.py', 
                    use_gpu=True,
                    node_count=1,
                    custom_docker_image = "wopauli_1.8-gpu:1",
                    image_registry_details=acr,
                    user_managed=True
                    )

    ps = RandomParameterSampling(
        {
            '--batch_size': choice(1, 2, 4, 8),
            '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
            '--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"), #, "48, 96"),
            '--learning_rate': loguniform(-6, -1),
            '--lr_decay': loguniform(-9, -1),
            '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
            '--transfer_learning': choice("True", "False")
        }
    )

    policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=10)

    hdc = HyperDriveConfig(estimator=est, 
                            hyperparameter_sampling=ps, 
                            policy=policy, 
                            primary_metric_name='val_loss', 
                            primary_metric_goal=PrimaryMetricGoal.MINIMIZE, 
                            max_total_runs=10,
                            max_concurrent_runs=5, 
                            max_duration_minutes=60*6
                            )

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data, 
            '--remote_execution',
            '--dataset', dataset
            ],
        inputs=[preprocessed_data],
        metrics_output = data_metrics,
        allow_reuse=True
    )
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=cpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.']
    )
    registration_step.run_after(hd_step)

    pipeline = Pipeline(workspace=ws, steps=[video_decoding, data_prep, hd_step, registration_step])
    print ("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete") 

    pipeline_name = 'prednet_' + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)
    

    schedule = Schedule.create(workspace=ws, name=pipeline_name + "_sch",
                            pipeline_id=published_pipeline.id, 
                            experiment_name=pipeline_name,
                            datastore=def_blob_store,
                            wait_for_provisioning=True,
                            description="Datastore scheduler for Pipeline" + pipeline_name,
                            path_on_datastore=os.path.join('prednet/data/video', dataset, 'Train'),
                            polling_interval=1
                            )

    return pipeline_name
コード例 #9
0
                    'keras', 'tensorflow', 'tensorflow-gpu', 'matplotlib',
                    'pillow', 'six', 'numpy', 'azureml-sdk', 'tqdm'
                ],
                conda_packages=['cudatoolkit=10.0.130'],
                entry_script='kd_squeezenet.py',
                use_gpu=True,
                node_count=1)

from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.pipeline.steps import HyperDriveStep
from azureml.train.hyperdrive import choice, loguniform, uniform

ps = RandomParameterSampling({
    '--learning_rate': uniform(1e-3, 2e-2),
    '--momentum': uniform(.1, .95),
    '--weight_decay': loguniform(-5, -3),
    '--temperature': uniform(1, 9),
    # '--lambda_const': uniform(.1, .3),
    '--transfer_learning': choice("True", "False")
})

policy = BanditPolicy(evaluation_interval=2,
                      slack_factor=0.1,
                      delay_evaluation=10)

hdc = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='val_loss',
    primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
    max_total_runs=5,  #100,
コード例 #10
0
args = [
    '--input-data', ds.as_named_input('train_data'),
    #'--num-topics', 10,
    '--chunksize', 2000,
    '--passes', 20,
    '--iterations', 400
]

src = ScriptRunConfig(source_directory="./topicmodel",
                      script='train.py',
                      arguments=args,
                      compute_target=compute_target,
                      environment=env)

param_sampling = RandomParameterSampling({
    "--num-topics": choice(5, 10, 15, 20)
})

# Submit experiment

hd = HyperDriveConfig(run_config=src,
                      hyperparameter_sampling=param_sampling,
                      primary_metric_name="c_v",
                      primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                      max_total_runs=100,
                      max_concurrent_runs=4)

run = exp.submit(config=hd)

run.wait_for_completion(show_output=False)
コード例 #11
0
 def get_param_sampling(self):
     return RandomParameterSampling(PARAMETER_SAMPLING)
コード例 #12
0
ファイル: trainseg.py プロジェクト: balakreshnan/visionautoml
parameter_space = {
    'model_name': choice('maskrcnn_resnet50_fpn'),
    'learning_rate': uniform(0.0001, 0.001),
    #'warmup_cosine_lr_warmup_epochs': choice(0, 3),
    'optimizer': choice('sgd', 'adam', 'adamw'),
    'min_size': choice(600, 800)
}

tuning_settings = {
    'iterations':
    20,
    'max_concurrent_iterations':
    4,
    'hyperparameter_sampling':
    RandomParameterSampling(parameter_space),
    'policy':
    BanditPolicy(evaluation_interval=2, slack_factor=0.2, delay_evaluation=6)
}

automl_image_config = AutoMLImageConfig(
    task='image-instance-segmentation',
    compute_target=compute_target,
    training_data=training_dataset,
    validation_data=validation_dataset,
    primary_metric='mean_average_precision',
    **tuning_settings)

automl_image_run = experiment.submit(automl_image_config)

automl_image_run.wait_for_completion(wait_post_processing=True)
コード例 #13
0
print()
print('##################################################')
print('Batch AI run...')
print('##################################################')
print()

# start the job
run = sk_est.fit()
print(helpers.get_run_history_url(run))
run.wait_for_completion(show_output=True)

print('configure hyperdrive.')

# parameter space to sweep over
ps = RandomParameterSampling({"alpha": uniform(0.0, 1.0)})

# early termniation policy
# check every 2 iterations and if the primary metric (epoch_val_acc) falls
# outside of the range of 10% of the best recorded run so far, terminate it.
etp = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Hyperdrive run configuration
hrc = HyperDriveRunConfig(
    ".",
    estimator=sk_est,
    hyperparameter_sampling=ps,
    policy=etp,
    # metric to watch (for early termination)
    primary_metric_name='mse',
    # terminate if metric falls below threshold
コード例 #14
0
def build_pipeline(dataset, ws, config):
    print("building pipeline for dataset %s in workspace %s" %
          (dataset, ws.name))

    hostname = socket.gethostname()
    if hostname == 'wopauliNC6':
        base_dir = '.'
    else:
        base_dir = '.'

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    shutil.copy(os.path.join(base_dir, 'video_decoding.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_submit.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_build.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'train.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'prednet.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'keras_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_preparation.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'model_registration.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'config.json'), script_folder)

    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except ComputeTargetException:
        print("creating new compute target")

        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_D2_V2',
            max_nodes=4,
            idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name,
                                                  provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = config['gpu_compute']

    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except ComputeTargetException:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6',
            max_nodes=5,
            idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name,
                                                  provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(gpu_compute_target.get_status().serialize())

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(conda_packages=["py-opencv=3.4.2"],
                                      pip_packages=[
                                          "azure-storage-blob==1.5.0",
                                          "hickle==3.4.3", "requests==2.21.0",
                                          "sklearn", "pandas==0.24.2",
                                          "azureml-sdk==1.0.21",
                                          "numpy==1.16.2", "pillow==6.0.0"
                                      ])
    gpu_cd = CondaDependencies.create(pip_packages=[
        "keras==2.0.8", "theano==1.0.4", "tensorflow==1.8.0",
        "tensorflow-gpu==1.8.0", "hickle==3.4.3", "matplotlib==3.0.3",
        "seaborn==0.9.0", "requests==2.21.0", "bs4==0.0.1", "imageio==2.5.0",
        "sklearn", "pandas==0.24.2", "azureml-sdk==1.0.21", "numpy==1.16.2"
    ])

    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    gpu_compute_run_config = RunConfiguration(conda_dependencies=gpu_cd)
    gpu_compute_run_config.environment.docker.enabled = True
    gpu_compute_run_config.environment.docker.gpu_support = True
    gpu_compute_run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE
    gpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    video_data = DataReference(datastore=def_blob_store,
                               data_reference_name="video_data",
                               path_on_datastore=os.path.join(
                                   "prednet", "data", "video", dataset))

    # Naming the intermediate data as processed_data1 and assigning it to the variable processed_data1.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    print("DataReference object created")

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.'])
    print("video_decode created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(name='prepare_data',
                                 script_name="data_preparation.py",
                                 arguments=[
                                     "--input_data", raw_data, "--output_data",
                                     preprocessed_data
                                 ],
                                 inputs=[raw_data],
                                 outputs=[preprocessed_data],
                                 compute_target=cpu_compute_target,
                                 source_directory=script_folder,
                                 runconfig=cpu_compute_run_config,
                                 allow_reuse=True,
                                 hash_paths=['.'])
    data_prep.run_after(video_decoding)

    print("data_prep created")

    est = TensorFlow(source_directory=script_folder,
                     compute_target=gpu_compute_target,
                     pip_packages=[
                         'keras==2.0.8', 'theano', 'tensorflow==1.8.0',
                         'tensorflow-gpu==1.8.0', 'matplotlib', 'horovod',
                         'hickle'
                     ],
                     entry_script='train.py',
                     use_gpu=True,
                     node_count=1)

    ps = RandomParameterSampling({
        '--batch_size':
        choice(2, 4, 8, 16),
        '--filter_sizes':
        choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        '--stack_sizes':
        choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),  #, "48, 96"),
        '--learning_rate':
        loguniform(-6, -1),
        '--lr_decay':
        loguniform(-9, -1),
        '--freeze_layers':
        choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "1", "2",
               "3"),
        '--transfer_learning':
        choice("True", "False")
    })

    policy = BanditPolicy(evaluation_interval=2,
                          slack_factor=0.1,
                          delay_evaluation=20)

    hdc = HyperDriveRunConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        policy=policy,
        primary_metric_name='val_loss',
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=5,  #100,
        max_concurrent_runs=5,  #10,
        max_duration_minutes=60 * 6)

    hd_step = HyperDriveStep(name="train_w_hyperdrive",
                             hyperdrive_run_config=hdc,
                             estimator_entry_script_arguments=[
                                 '--data-folder', preprocessed_data,
                                 '--remote_execution'
                             ],
                             inputs=[preprocessed_data],
                             metrics_output=data_metrics,
                             allow_reuse=True)
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=gpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.'])
    registration_step.run_after(hd_step)

    pipeline = Pipeline(
        workspace=ws,
        steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    pipeline.publish(name=pipeline_name)

    return pipeline_name
コード例 #15
0
            ### Hyperparameters params
            if language == 'en':
                model_type = choice('roberta', 'bert', 'albert')
            elif language == 'de':
                model_type = choice('distilbert', 'bert', 'roberta')
            elif language == 'it' or language == 'es':
                model_type = choice('bert')
            elif language == 'fr':
                model_type = choice('camembert', 'bert')
            param_sampling = RandomParameterSampling({
                '--n_epochs':
                choice(3, 5, 10),
                '--learning_rate':
                choice(1e-5, 2e-5, 3e-5, 4e-5),
                '--model_type':
                model_type,
                '--max_seq_len':
                choice(128, 256),
                '--embeds_dropout':
                choice(0.1, 0.2, 0.3)
            })
            ## Termination policy
            early_termination_policy = BanditPolicy(slack_factor=0.1,
                                                    evaluation_interval=1,
                                                    delay_evaluation=3)
            ## Prepare HyperDrive Config
            hdc = HyperDriveConfig(
                estimator=est,
                hyperparameter_sampling=param_sampling,
                policy=early_termination_policy,
                primary_metric_name='f1macro',
コード例 #16
0
# Define Run Configuration
estimator = Estimator(
    entry_script='hypertrain.py',
    source_directory=os.path.dirname(os.path.realpath(__file__)),
    compute_target=workspace.compute_targets[cluster_name],
    pip_packages=[
        'numpy==1.15.4', 'pandas==0.23.4', 'scikit-learn==0.20.1',
        'scipy==1.0.0', 'matplotlib==3.0.2', 'utils==0.9.0'
    ])

# Set parameters for search
param_sampling = RandomParameterSampling({
    "max_depth":
    choice([100, 50, 20, 10]),
    "n_estimators":
    choice([50, 150, 200, 250]),
    "criterion":
    choice(['gini', 'entropy']),
    "min_samples_split":
    choice([2, 3, 4, 5])
})

# Define multi-run configuration
hyperdrive_run_config = HyperDriveConfig(
    estimator=estimator,
    hyperparameter_sampling=param_sampling,
    policy=None,
    primary_metric_name="accuracy",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=2,
    max_concurrent_runs=None)
コード例 #17
0
ファイル: tf2train.py プロジェクト: mindis/mlops
y_hat = np.argmax(pred, axis=1)

# print the first 30 labels and predictions
print('labels:  \t', y_test[:30])
print('predictions:\t', y_hat[:30])

print("Accuracy on the test set:", np.average(y_hat == y_test))

from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice, loguniform

ps = RandomParameterSampling({
    '--batch-size':
    choice(32, 64, 128),
    '--first-layer-neurons':
    choice(16, 64, 128, 256, 512),
    '--second-layer-neurons':
    choice(16, 64, 256, 512),
    '--learning-rate':
    loguniform(-6, -1)
})

est = TensorFlow(source_directory=script_folder,
                 script_params={
                     '--data-folder':
                     dataset.as_named_input('mnist').as_mount()
                 },
                 compute_target=compute_target,
                 entry_script='tf_mnist2.py',
                 framework_version='2.0',
                 use_gpu=True,
                 pip_packages=['azureml-dataprep[pandas,fuse]'])
コード例 #18
0
                    compute_target=compute_target,
                    entry_script='pytorch_train.py',
                    use_gpu=True)

# Now that we've seen how to do a simple PyTorch training run using the SDK, let's see if we can further improve the accuracy of our model. We can optimize our model's hyperparameters using Azure Machine Learning's hyperparameter tuning capabilities.
# Start a hyperparameter sweep

# First, we will define the hyperparameter space to sweep over. Since our training script uses a learning rate schedule to decay the learning rate every several epochs, let's tune the initial learning rate and the momentum parameters. In this example we will use random sampling to try different configuration sets of hyperparameters to maximize our primary metric, the best validation accuracy (best_val_acc).

# Then, we specify the early termination policy to use to early terminate poorly performing runs. Here we use the BanditPolicy, which will terminate any run that doesn't fall within the slack factor of our primary evaluation metric. In this tutorial, we will apply this policy every epoch (since we report our best_val_acc metric every epoch and evaluation_interval=1). Notice we will delay the first policy evaluation until after the first 10 epochs (delay_evaluation=10). Refer here for more information on the BanditPolicy and other policies available.

from azureml.train.hyperdrive import RandomParameterSampling, HyperDriveRunConfig, BanditPolicy, PrimaryMetricGoal, uniform

param_sampling = RandomParameterSampling( {
        'learning_rate': uniform(0.0005, 0.005),
        'momentum': uniform(0.9, 0.99)
    }
)

early_termination_policy = BanditPolicy(slack_factor=0.15, evaluation_interval=1, delay_evaluation=10)

hyperdrive_run_config = HyperDriveRunConfig(estimator=estimator,
                                            hyperparameter_sampling=param_sampling, 
                                            policy=early_termination_policy,
                                            primary_metric_name='best_val_acc',
                                            primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                            max_total_runs=8,
                                            max_concurrent_runs=4)

# Finally, lauch the hyperparameter tuning job.
コード例 #19
0
                 entry_script='train.py',
                 use_gpu=True,
                 node_count=1)

# run = exp.submit(est)

# print(run)

# run.wait_for_completion(show_output=True)

ps = RandomParameterSampling({
    '--batch_size':
    choice(2, 4, 8, 16),
    '--filter_sizes':
    choice("3 3 3", "4 4 4", "5 5 5"),
    '--stack_sizes':
    choice("48 96 192", "36 72 144", "12 24 48"),
    '--learning_rate':
    loguniform(-6, -1),
    '--lr_decay':
    loguniform(-9, -1)
})

policy = BanditPolicy(evaluation_interval=2,
                      slack_factor=0.1)  #, delay_evaluation=20)

hdc = HyperDriveRunConfig(estimator=est,
                          hyperparameter_sampling=ps,
                          policy=policy,
                          primary_metric_name='val_loss',
                          primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                          max_total_runs=5,
コード例 #20
0
    )

    # set up hyperdrive search space
    convert_base = lambda x: float(np.log(x))
    search_space = {
        "--learning_rate":
        hyperdrive.loguniform(
            convert_base(1e-6),
            convert_base(5e-2)),  # NB. loguniform on [exp(min), exp(max)]
        "--weight_decay":
        hyperdrive.uniform(5e-3, 15e-2),
        "--per_device_train_batch_size":
        hyperdrive.choice([16, 32]),
    }

    hyperparameter_sampling = RandomParameterSampling(search_space)

    policy = TruncationSelectionPolicy(truncation_percentage=50,
                                       evaluation_interval=2,
                                       delay_evaluation=0)

    hyperdrive_config = HyperDriveConfig(
        run_config=config,
        hyperparameter_sampling=hyperparameter_sampling,
        policy=policy,
        primary_metric_name="eval_matthews_correlation",
        primary_metric_goal=hyperdrive.PrimaryMetricGoal.MAXIMIZE,
        max_total_runs=20,
        max_concurrent_runs=8,
    )
コード例 #21
0
     compute_target=compute_target,
     pip_packages=GeneralConfig.pip_packages,
     entry_script=PathsConfig.entry_script,
     use_gpu=True,
     custom_docker_image=settings["IMAGE_NAME"],
 )
 if GeneralConfig.hyperdrive:
     if GeneralConfig.architecture_type == "PretrainedResNet50":
         hyperparams_space = HyperdriveConfig.pretrained_resnet50_hyperparams_space
     else:
         raise NotImplementedError
     hyperparams_space_format = {
         parameter: choice(parameter_range)
         for parameter, parameter_range in hyperparams_space.items()
     }
     parameters_sampling = RandomParameterSampling(hyperparams_space_format)
     policy = BanditPolicy(
         evaluation_interval=HyperdriveConfig.evaluation_interval,
         slack_factor=HyperdriveConfig.slack_factor,
     )
     hdc = HyperDriveConfig(
         estimator=est,
         hyperparameter_sampling=parameters_sampling,
         policy=policy,
         primary_metric_name="Accuracy",
         primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
         max_total_runs=HyperdriveConfig.max_total_runs,
         max_concurrent_runs=HyperdriveConfig.max_concurrent_runs,
     )
     run = exp.submit(hdc)
 else:
コード例 #22
0
                 '--batch_size': choice(16, 32, 64),
                 '--learning_rate': choice(0.01, 0.1, 1.0)
              }

param_sampling = GridParameterSampling(param_space)             


#random sampling
from azureml.train.hyperdrive import RandomParameterSampling, choice, normal

param_space = {
                 '--batch_size': choice(16, 32, 64),
                 '--learning_rate': normal(10, 3)
              }

param_sampling = RandomParameterSampling(param_space)


#Bayesian sampling
from azureml.train.hyperdrive import BayesianParameterSampling, choice, uniform

param_space = {
                 '--batch_size': choice(16, 32, 64),
                 '--learning_rate': uniform(0.5, 0.1)
              }

param_sampling = BayesianParameterSampling(param_space)


#bandit policy
from azureml.train.hyperdrive import BanditPolicy
コード例 #23
0
                    'tensorflow-gpu==1.12.0', 'matplotlib', 'horovod',
                    'hickle', 'pillow==5.1.0', 'six==1.11.0', 'numpy==1.14.5'
                ],
                entry_script='vanilla_squeezenet.py',
                use_gpu=True,
                node_count=1)

# run = exp.submit(est)

# print(run)

# run.wait_for_completion(show_output=True)

ps = RandomParameterSampling({
    '--learning_rate': loguniform(-4, -2),
    '--momentum': loguniform(-9, -1),
    '--weight_decay': loguniform(-3, -1),
    '--transfer_learning': choice("True", "False")
})

policy = BanditPolicy(evaluation_interval=2,
                      slack_factor=0.1,
                      delay_evaluation=10)

hdc = HyperDriveRunConfig(estimator=est,
                          hyperparameter_sampling=ps,
                          policy=policy,
                          primary_metric_name='val_loss',
                          primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                          max_total_runs=100,
                          max_concurrent_runs=5)