# Define configs
# Allowed arguments are: randomforest, sklearn, deeplearning
#   randomforest  will perform 1 run of a random forest fit
#   sklearn       will fit 15 models from sklearn
#   deeplearning  will fit a neural network with PyTorch
models = 'randomforest'

data_local = False  # if data_local is True, subset is always True
subset = False
hyperdrive = False  # hyperdrive only works with deeplearning

# If deep learning, define hyperparameters
# Set parameters for search
param_sampling = BayesianParameterSampling({
    "learning_rate": uniform(0.05, 0.1),
    "num_epochs": choice(5, 10, 15),
    "batch_size": choice(150, 200),
    "hidden_size": choice(50, 100)
})

# load Azure ML workspace
workspace = Workspace.from_config(auth=AzureCliAuthentication())

if subset is True:
    # define data set names
    input_name_train = 'newsgroups_subset_train'
    input_name_test = 'newsgroups_subset_test'
    filepath = "environments/sklearn_subset/RunConfig/runconfig_subset.yml"
else:
    input_name_train = 'newsgroups_train'
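
# Illustrative only (not part of this snippet): the dataset names defined above
# are presumably resolved against the workspace via the Dataset API, e.g.:
from azureml.core import Dataset

dataset_train = Dataset.get_by_name(workspace, name=input_name_train)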
# In[62]:

import os

from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice

# Specify parameter sampler
ps = RandomParameterSampling({
    "--C": uniform(0.1, 1),
    "--max_iter": choice(50, 100, 150, 200)
})

# Specify a Policy
policy = BanditPolicy(slack_factor = 0.1, evaluation_interval = 1, delay_evaluation = 5)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory = '.', entry_script = 'train.py', compute_target = cpu_cluster)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    hyperparameter_sampling = ps,
    primary_metric_name = 'Accuracy',
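    # The remaining arguments are cut off in this excerpt; the completion below
    # is a hedged sketch, not the source code. The metric goal, policy, estimator
    # and run counts are assumptions.
    primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
    policy = policy,
    estimator = est,
    max_total_runs = 20,        # assumed value
    max_concurrent_runs = 4)    # assumed value

# Illustrative usage (not from the source): submit the sweep and monitor it in the
# notebook. Assumes a Workspace object `ws` from an earlier cell; the experiment
# name is a placeholder.
from azureml.core import Experiment

hyperdrive_run = Experiment(workspace = ws, name = 'sklearn-hyperdrive').submit(hyperdrive_config)
RunDetails(hyperdrive_run).show()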
datadrift_name = 'data_drift_report'
datadriftreport = PipelineData(
    name='data_drift_report',
    datastore=datastore,
    pipeline_output_name=datadrift_name)

datadrift_subset_name = 'data_drift_report_subset'
datadriftreportsubset = PipelineData(
    name='data_drift_report_subset',
    datastore=datastore,
    pipeline_output_name=datadrift_subset_name)

# Set parameters for search
param_sampling = BayesianParameterSampling({
    "learning_rate": uniform(10e-6, 1e0),
    "num_epochs": choice(10, 20),
    "batch_size": choice(10, 20, 50, 100, 200, 300, 500, 1000),
    "hidden_size": choice(300, 400)
})

# LOAD ALL SCRIPT PARAMETERS FOR EVERY STEP IN PIPELINE
script_params_data_validation = [
    '--data_folder_train', dataset_train.as_named_input('train').as_mount(),
    '--data_folder_test', dataset_test.as_named_input('test').as_mount(),
    '--local', 'no',
    '--output_train', train_validated,
    '--output_test', test_validated,
    '--data_drift_report', datadriftreport
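    # the argument list is truncated in this excerpt; it is closed here so the
    # sketch below parses (any remaining arguments are unknown)
]

# Illustrative sketch (not from the source): arguments like these are typically
# wired into a pipeline step, e.g. a PythonScriptStep that runs the validation
# script. The script name, compute target and source directory are assumptions.
from azureml.pipeline.steps import PythonScriptStep

data_validation_step = PythonScriptStep(
    name="data_validation",
    script_name="data_validation.py",          # assumed file name
    arguments=script_params_data_validation,
    outputs=[train_validated, test_validated, datadriftreport],
    compute_target=compute_target,             # assumed to be defined elsewhere
    source_directory='.',                      # assumed
    allow_reuse=False)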