Example #1
#
# I am using Bayesian sampling optimization to make the best use of the available time, assuming it will quickly detect the strength of the boosting ensemble with quite high complexity grades.
#
# In addition I am iterating through several different, reasonable learning rate factors, **lrf**. As the (effective) learning rates for the neural network and the boosting algorithm differ strongly, I am providing this hyperparameter as a factor applied to a "reasonable" base value defined in the script itself (see the sketch below). The effective values chosen are stored in the output pickle files for reproducibility.
#
# Also I am iterating through different **complexity** grades. For the neural network they define the width of the hidden layers; for the boosting variant they define the depth and number of estimators.
#
# And last but not least - though this value only affects the neural network - I am trying two different **iterations** counts, 100 and 200. About 100 iterations are the minimum needed to nearly converge; more iterations still improve the model slightly.
#
# After several tries a max_total_runs value of **50** turned out to be a good compromise; the best result is usually achieved after about 30 runs. Our target metric is - as in the AutoML variant - the r2_score, as this was also the original goal of the contest this dataset was used in, and it is a strong indicator for a dataset such as this one with quite high variance in the label data.
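
# A minimal sketch - an assumption, not the project's actual train.py - of how the
# **lrf** factor and **complexity** grade described above could be translated into
# effective model hyperparameters inside the training script. The base values and
# the function name are hypothetical.

# +
BASE_LR_MLP = 1e-3       # hypothetical "reasonable" base learning rate for the MLP
BASE_LR_BOOSTING = 0.1   # hypothetical base learning rate for gradient boosting


def effective_params(model, lrf, complexity, iterations):
    """Map the sampled factors to concrete hyperparameters (illustrative only)."""
    if model == 'mlpregressor':
        return {'learning_rate_init': BASE_LR_MLP * lrf,
                'hidden_layer_sizes': (int(100 * complexity), int(50 * complexity)),
                'max_iter': iterations}
    if model == 'gradientboosting':
        return {'learning_rate': BASE_LR_BOOSTING * lrf,
                'n_estimators': int(100 * complexity),
                'max_depth': max(2, int(3 * complexity))}
    return {}  # the 'linear' model uses neither a learning rate nor a complexity grade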

# +
# Specify the parameter sampler, using Bayesian sampling to quickly choose the most promising combinations
ps = BayesianParameterSampling({
    "--model":
    choice('linear', 'mlpregressor', 'gradientboosting'),
    "--lrf":
    choice(1.0, 0.1, 0.25, 0.5, 2.0),
    "--iterations":
    choice(100, 200),
    "--complexity":
    choice(1.0, 0.25, 0.5, 2.0)
})

# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory=script_path,
              entry_script=script_file,
              compute_target=compute_target)
# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    # metric name and run budget follow the description above; the exact metric
    # name logged by train.py is assumed to be 'r2_score'
    primary_metric_name='r2_score',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=50)

Example #2

from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
import os

# Specify parameter sampler
from azureml.train.hyperdrive.parameter_expressions import choice
ps = RandomParameterSampling({
    "--C": uniform(0.1, 1),
    "--max_iter": choice(50, 100, 150, 200)
})

# Specify an early-termination policy: after the first 5 evaluations, stop any run
# whose reported metric is not within the 10% slack of the best run so far
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=1, delay_evaluation=5)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory='.', entry_script='train.py', compute_target=cpu_cluster)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator=est,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=5)
Example #3
# allowed arguments are: randomforest, sklearn, deeplearning
# randomforest performs a single random forest fit
# sklearn fits 15 different models from scikit-learn
# deeplearning fits a neural network with PyTorch
models = 'randomforest'
data_local = False
# if data_local is True, subset is always True
subset = False
# hyperdrive only works with deeplearning (see the dispatch sketch below)
hyperdrive = False
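
# A hypothetical sketch - not part of the original script - of how the flags above
# could be interpreted together; the function name and the returned structure are
# illustrative assumptions only.
def resolve_run_plan(models, data_local, subset, hyperdrive):
    """Return a normalized description of what the pipeline would execute."""
    if data_local:
        subset = True  # local data always implies working on the subset
    if models not in ('randomforest', 'sklearn', 'deeplearning'):
        raise ValueError(f"unknown models argument: {models}")
    return {
        'models': models,
        'subset': subset,
        # hyperdrive tuning is only meaningful for the deep learning variant
        'hyperdrive': hyperdrive and models == 'deeplearning',
    }

print(resolve_run_plan(models, data_local, subset, hyperdrive))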

# If deep learning is selected, define its hyperparameters
# Set parameters for search
param_sampling = BayesianParameterSampling({
    "learning_rate": uniform(0.05, 0.1),
    "num_epochs": choice(5, 10, 15),
    "batch_size": choice(150, 200),
    "hidden_size": choice(50, 100)
})

# load Azure ML workspace
workspace = Workspace.from_config(auth=AzureCliAuthentication())

if subset is True:
    # define data set names
    input_name_train = 'newsgroups_subset_train'
    input_name_test = 'newsgroups_subset_test'
    filepath = "environments/sklearn_subset/RunConfig/runconfig_subset.yml"
else:
    input_name_train = 'newsgroups_train'
    input_name_test = 'newsgroups_test'
Example #4
# Tuning and sampling of hyperparameters.

from azureml.train.hyperdrive.runconfig import HyperDriveRunConfig
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.parameter_expressions import choice
from azureml.widgets import RunDetails

param_sampling = RandomParameterSampling({
    "--kernel":
    choice('linear', 'rbf', 'poly', 'sigmoid'),
    "--penalty":
    choice(0.5, 1, 1.5)
})

hyperdrive_run_config = HyperDriveRunConfig(
    estimator=estimator,
    hyperparameter_sampling=param_sampling,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=12,
    max_concurrent_runs=4)

# launch the hyperparameter tuning job.

hyperdrive_run = experiment.submit(hyperdrive_run_config)

# monitor hyperdrive runs

RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)
assert (hyperdrive_run.get_status() == "Completed")
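
# As a possible follow-up - not part of the original snippet - the best run and its
# winning arguments could be inspected once the sweep has completed:
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print(best_run.get_metrics())                                 # includes the logged 'Accuracy'
print(best_run.get_details()['runDefinition']['arguments'])   # chosen --kernel / --penalty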
Example #5
datadriftreport = PipelineData(name='data_drift_report',
                               datastore=datastore,
                               pipeline_output_name=datadrift_name)

datadrift_subset_name = 'data_drift_report_subset'
datadriftreportsubset = PipelineData(
    name='data_drift_report_subset',
    datastore=datastore,
    pipeline_output_name=datadrift_subset_name)

# Set parameters for search
param_sampling = BayesianParameterSampling({
    "learning_rate":
    uniform(10e-6, 1e0),
    "num_epochs":
    choice(10, 20),
    "batch_size":
    choice(10, 20, 50, 100, 200, 300, 500, 1000),
    "hidden_size":
    choice(300, 400)
})

# LOAD ALL SCRIPT PARAMETERS FOR EVERY STEP IN PIPELINE
script_params_data_validation = [
    '--data_folder_train',
    dataset_train.as_named_input('train').as_mount(), '--data_folder_test',
    dataset_test.as_named_input('test').as_mount(), '--local', 'no',
    '--output_train', train_validated, '--output_test', test_validated,
    '--data_drift_report', datadriftreport
]

Example #6
# So far we have been using default hyperparameter values, but in practice we would need to tune these values to optimize the performance. Azure Machine Learning service provides many methods for tuning hyperparameters using different strategies.
#
# The first step is to choose the parameter space that we want to search. We have a few choices to make here:
#
# - **Parameter Sampling Method**: This is how we select the combinations of parameters to sample. Azure Machine Learning service offers [RandomParameterSampling](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive.randomparametersampling?view=azure-ml-py), [GridParameterSampling](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive.gridparametersampling?view=azure-ml-py), and [BayesianParameterSampling](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive.bayesianparametersampling?view=azure-ml-py). We will use the `GridParameterSampling` method.
# - **Parameters To Search**: We will be searching for optimal combinations of `learning_rate` and `num_epochs`.
# - **Parameter Expressions**: This defines the [functions that can be used to describe a hyperparameter search space](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive.parameter_expressions?view=azure-ml-py), which can be discrete or continuous. We will be using a `discrete set of choices`.
#
# The following code allows us to define these options.

# %%
from azureml.train.hyperdrive import GridParameterSampling
from azureml.train.hyperdrive.parameter_expressions import choice

param_sampling = GridParameterSampling({
    '--learning_rate': choice(3e-5, 3e-4),
    '--num_epochs': choice(3, 4)
})

# %% [markdown]
# The next step is to define how we want to measure our performance. We do so by specifying two classes:
#
# - **[PrimaryMetricGoal](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive.primarymetricgoal?view=azure-ml-py)**: We want to `MAXIMIZE` the `val_accuracy` that is logged in our training script.
# - **[BanditPolicy](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive.banditpolicy?view=azure-ml-py)**: A policy for early termination so that jobs which don't show promising results will stop automatically.

# %%
from azureml.train.hyperdrive import BanditPolicy
from azureml.train.hyperdrive import PrimaryMetricGoal

primary_metric_name = 'val_accuracy'
primary_metric_goal = PrimaryMetricGoal.MAXIMIZE
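
# %% [markdown]
# A possible definition of the early termination policy mentioned above; the slack
# and interval values below are illustrative assumptions, not taken from the
# original notebook.

# %%
early_termination_policy = BanditPolicy(slack_factor=0.1,
                                        evaluation_interval=1,
                                        delay_evaluation=2)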
Example #7
def main(
        workspace=None,
        dataset_train_name=config.get_default_dataset_name("train"),
        dataset_validate_name=config.get_default_dataset_name("validate"),
):
    """
    Return HyperDriveConfig
    """
    if not workspace:
        workspace = package_utils.get_workspace()

    cluster_max_nodes = 4
    args = aml_compute.parse_args()
    args.cluster_max_nodes = cluster_max_nodes
    args.cluster_sku = "Standard_D2s_v3"
    args.cluster_vm_priority = "dedicated"
    compute_target = aml_compute.main(args)
    logger.info(msg="main",
                extra={"compute_target": compute_target.serialize()})

    environment = get_environment()
    logger.info(msg="main", extra={"environment": environment})

    run_config = RunConfiguration()
    run_config.target = compute_target
    run_config.environment = environment
    logger.info(msg="main", extra={"run_config": run_config})

    parameter_space = {
        "--hyperparameter-n_estimators": choice(range(15, 20 + 1, 5)),
        "--hyperparameter-criterion": choice(["gini", "entropy"]),
        "--hyperparameter-max_depth": choice(range(10, 15 + 1, 5)),
    }
    hyperparameter_sampling = GridParameterSampling(parameter_space)
    # Each choice(...) expression serializes to ["choice", [[option, ...]]], so
    # value[1][0] is the list of options; the product of the list lengths equals
    # the total number of grid combinations.
    hyperparameter_sampling_number_of_runs = functools.reduce(
        operator.mul, [len(value[1][0]) for value in parameter_space.values()])

    train = Dataset.get_by_name(
        workspace=workspace,
        name=dataset_train_name,
    )
    validate = Dataset.get_by_name(
        workspace=workspace,
        name=dataset_validate_name,
    )

    arguments = [
        "--dataset-train-path",
        train.as_named_input("train").as_mount(),
        "--dataset-validate-path",
        validate.as_named_input("validate").as_mount(),
        "--hyperparameter-n_jobs",
        -1,
        "--hyperparameter-random_state",
        0,
    ]

    script_run_config = ScriptRunConfig(
        source_directory="nd00333/model/hyperdrive/train",
        script="train.py",
        arguments=arguments,
        run_config=run_config,
        compute_target=compute_target,
        environment=environment,
        max_run_duration_seconds=60 * 10,
    )

    # GridParameterSampling is not an iterative process, so it does not profit from
    # early termination: a highly accurate configuration may follow an inaccurate
    # one. Therefore a termination policy that will not stop any run is used.
    policy = BanditPolicy(evaluation_interval=1,
                          slack_factor=None,
                          slack_amount=1.0,
                          delay_evaluation=0)

    hd_config = HyperDriveConfig(
        hyperparameter_sampling=hyperparameter_sampling,
        primary_metric_name="norm_macro_recall",
        primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
        max_total_runs=hyperparameter_sampling_number_of_runs,
        max_concurrent_runs=cluster_max_nodes,
        policy=policy,
        run_config=script_run_config,
    )

    return hd_config
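

# A hypothetical usage sketch - not part of the original module - showing how the
# returned HyperDriveConfig could be submitted; the experiment name is an assumption.
if __name__ == "__main__":
    from azureml.core import Experiment

    hyperdrive_config = main()
    experiment = Experiment(workspace=package_utils.get_workspace(),
                            name="hyperdrive-train")
    run = experiment.submit(hyperdrive_config)
    run.wait_for_completion(show_output=True)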