# Define configs
# Allowed values for models are: randomforest, sklearnmodels, deeplearning
# randomforest will perform 1 run of a random forest fit
# sklearnmodels will fit 15 models from sklearn
# deeplearning will fit a neural network with PyTorch
models = 'randomforest'
data_local = False
# if data_local is True, subset is always True
subset = False
# hyperdrive only works with deeplearning
hyperdrive = False
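
# Optional sanity check (not part of the original script): a minimal sketch that
# enforces the rules stated in the comments above, i.e. models must be one of the
# three allowed values, data_local forces subset, and hyperdrive only makes sense
# together with deeplearning.
assert models in ('randomforest', 'sklearnmodels', 'deeplearning'), \
    f"Unknown models value: {models}"
if data_local:
    subset = True  # local data is always the subset
if hyperdrive and models != 'deeplearning':
    raise ValueError("hyperdrive=True is only supported when models='deeplearning'")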

# If deeplearning is selected, define the hyperparameter search space
from azureml.train.hyperdrive.sampling import BayesianParameterSampling
from azureml.train.hyperdrive.parameter_expressions import uniform, choice

# Set parameters for search
param_sampling = BayesianParameterSampling({
    "learning_rate": uniform(0.05, 0.1),
    "num_epochs": choice(5, 10, 15),
    "batch_size": choice(150, 200),
    "hidden_size": choice(50, 100)
})
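# Note: Bayesian sampling picks new parameter values based on how previous runs
# performed, so it typically needs a larger run budget than random sampling and,
# in Azure ML, it cannot be combined with an early-termination policy.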

# Load the Azure ML workspace from the local config.json using Azure CLI credentials
from azureml.core import Workspace
from azureml.core.authentication import AzureCliAuthentication

workspace = Workspace.from_config(auth=AzureCliAuthentication())

if subset:
    # define data set names for the subset run
    input_name_train = 'newsgroups_subset_train'
    input_name_test = 'newsgroups_subset_test'
    filepath = "environments/sklearn_subset/RunConfig/runconfig_subset.yml"
else:
    # define data set names for the full run
    input_name_train = 'newsgroups_train'
    input_name_test = 'newsgroups_test'
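
# The names above refer to datasets registered in the Azure ML workspace.
# A minimal sketch (assuming the datasets were registered beforehand) of how
# they could be retrieved for the steps further below:
from azureml.core import Dataset

dataset_train = Dataset.get_by_name(workspace, name=input_name_train)
dataset_test = Dataset.get_by_name(workspace, name=input_name_test)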


from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
import os

# Specify parameter sampler
from azureml.train.hyperdrive.parameter_expressions import choice

ps = RandomParameterSampling({
    "--C": uniform(0.1, 1),
    "--max_iter": choice(50, 100, 150, 200)
})
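# Note: the "--C" and "--max_iter" keys are passed to train.py as command-line
# arguments, so train.py is expected to parse them (e.g. with argparse) and use
# them as the regularization strength and iteration limit of the sklearn model.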

# Specify an early-termination policy: after the first 5 evaluation intervals,
# cancel runs whose primary metric falls more than the 10% slack outside the best run so far
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=1, delay_evaluation=5)

if "training" not in os.listdir():
    os.mkdir("./training")
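
# The estimator below refers to a compute target named cpu_cluster, which is not
# created in this snippet. A minimal sketch of provisioning (or reusing) such a
# cluster; the cluster name and VM size here are illustrative assumptions.
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

try:
    cpu_cluster = ComputeTarget(workspace=workspace, name="cpu-cluster")
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D2_V2",
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(workspace, "cpu-cluster", compute_config)
    cpu_cluster.wait_for_completion(show_output=True)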

# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory='.', entry_script='train.py', compute_target=cpu_cluster)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator=est,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     # run limits below are illustrative values
                                     max_total_runs=20,
                                     max_concurrent_runs=4)
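
# A minimal sketch of submitting the sweep (the experiment name is illustrative);
# this is where the RunDetails import above would be used to monitor progress.
from azureml.core import Experiment

experiment = Experiment(workspace, "newsgroups-hyperdrive")
hyperdrive_run = experiment.submit(hyperdrive_config)
RunDetails(hyperdrive_run).show()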
# Example #3
# Define pipeline outputs for the data drift reports
# (datastore is assumed to be defined earlier, e.g. workspace.get_default_datastore())
from azureml.pipeline.core import PipelineData

datadrift_name = 'data_drift_report'
datadriftreport = PipelineData(name='data_drift_report',
                               datastore=datastore,
                               pipeline_output_name=datadrift_name)

datadrift_subset_name = 'data_drift_report_subset'
datadriftreportsubset = PipelineData(
    name='data_drift_report_subset',
    datastore=datastore,
    pipeline_output_name=datadrift_subset_name)
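
# A minimal sketch of how such a PipelineData output is typically wired into a
# pipeline step; the step name, script name and arguments are illustrative and
# not part of the original snippet.
from azureml.pipeline.steps import PythonScriptStep

drift_step = PythonScriptStep(name="data_drift",
                              script_name="data_drift.py",
                              arguments=["--data_drift_report", datadriftreport],
                              outputs=[datadriftreport],
                              compute_target=cpu_cluster,
                              source_directory=".")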

# Set parameters for search
param_sampling = BayesianParameterSampling({
    "learning_rate": uniform(10e-6, 1e0),
    "num_epochs": choice(10, 20),
    "batch_size": choice(10, 20, 50, 100, 200, 300, 500, 1000),
    "hidden_size": choice(300, 400)
})

# LOAD ALL SCRIPT PARAMETERS FOR EVERY STEP IN PIPELINE
script_params_data_validation = [
    '--data_folder_train', dataset_train.as_named_input('train').as_mount(),
    '--data_folder_test', dataset_test.as_named_input('test').as_mount(),
    '--local', 'no',
    '--output_train', train_validated,
    '--output_test', test_validated,
    '--data_drift_report', datadriftreport