def create_hyperdrive_trainer(self, estimator, hd_dict, search_type, metric_name, maximize_metric, early_term_policy, max_total_runs, max_concurrent_runs, max_minutes):
    """
    Build an Azure ML HyperDrive configuration around the given estimator.

    :param estimator: Configured AzureML estimator to tune.
    :param hd_dict: Hyperparameter search-space dictionary.
    :param search_type: One of "random", "grid" or "bayesian".
    :param metric_name: Name of the primary metric logged by the training script.
    :param maximize_metric: True to maximize the primary metric, False to minimize it.
    :param early_term_policy: Early termination policy instance (or None).
    :param max_total_runs: Upper bound on the number of child runs.
    :param max_concurrent_runs: Upper bound on simultaneous child runs.
    :param max_minutes: Maximum wall-clock duration of the sweep, in minutes.
    :return: A configured HyperDriveConfig instance.
    """
    from azureml.train.hyperdrive import RandomParameterSampling, GridParameterSampling, BayesianParameterSampling

    if search_type == "random":
        sampler = RandomParameterSampling(hd_dict)
    elif search_type == "grid":
        sampler = GridParameterSampling(hd_dict)
    elif search_type == "bayesian":
        sampler = BayesianParameterSampling(hd_dict)
    else:
        # config_error is expected to abort; no sampler exists past this point.
        errors.config_error(
            "Azure ML Hyperdrive search_type not supported: " + search_type)

    # Concurrency can never usefully exceed the total run budget.
    concurrent = min(max_total_runs, max_concurrent_runs)

    from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal
    goal = PrimaryMetricGoal.MAXIMIZE if maximize_metric else PrimaryMetricGoal.MINIMIZE
    return HyperDriveConfig(
        estimator=estimator,
        hyperparameter_sampling=sampler,
        policy=early_term_policy,
        primary_metric_name=metric_name,
        primary_metric_goal=goal,
        max_total_runs=max_total_runs,
        max_concurrent_runs=concurrent,
        max_duration_minutes=max_minutes)
def get_parameter_search_hyperdrive_config(
        self, estimator: Estimator) -> HyperDriveConfig:
    """
    Build the Azure HyperDrive configuration used for hyperparameter search.

    Tutorial:
    https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters
    API reference:
    https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train
    .hyperdrive?view=azure-ml-py

    :param estimator: The estimator (configured PyTorch environment) of the experiment.
    :return: An Azure HyperDrive run configuration (configured PyTorch environment).
    """
    # Only the learning rate is searched, drawn uniformly from [0.0005, 0.01].
    sampler = RandomParameterSampling({'l_rate': uniform(0.0005, 0.01)})
    # Stop runs trailing the best by more than 15%, checked every evaluation
    # after an initial grace period of 10 evaluations.
    bandit = BanditPolicy(slack_factor=0.15,
                          evaluation_interval=1,
                          delay_evaluation=10)
    return HyperDriveConfig(
        estimator=estimator,
        hyperparameter_sampling=sampler,
        policy=bandit,
        primary_metric_name=TrackedMetrics.Val_Loss.value,
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=10,
        max_concurrent_runs=2)
def _create_dummy_hyperdrive_param_search_config(
        estimator: Estimator) -> HyperDriveConfig:
    """Return a minimal HyperDrive config (random search over l_rate only)."""
    sampler = RandomParameterSampling({'l_rate': uniform(0.0005, 0.01)})
    return HyperDriveConfig(estimator=estimator,
                            hyperparameter_sampling=sampler,
                            primary_metric_name=TrackedMetrics.Val_Loss.value,
                            primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                            max_total_runs=HYPERDRIVE_TOTAL_RUNS)
def get_hd_config(self, config):
    """Assemble the HyperDrive configuration from module-level settings.

    :param config: Run configuration for a single training run.
    :return: HyperDriveConfig wrapping `config` with this object's sampling
        strategy and bandit policy.
    """
    return HyperDriveConfig(run_config=config,
                            hyperparameter_sampling=self.get_param_sampling(),
                            policy=self.get_bandit_policy(),
                            primary_metric_name=PRIMARY_METRIC_NAME,
                            primary_metric_goal=PRIMARY_METRIC_GOAL,
                            max_total_runs=MAX_TOTAL_RUNS,
                            max_concurrent_runs=MAX_CONCURRENT_RUNS)
def main(epochs, iterations, compute_target, concurrent_runs):
    """Submit a HyperDrive sweep for the food-classification TensorFlow model.

    :param epochs: Number of training epochs passed to train.py.
    :param iterations: Maximum number of HyperDrive child runs.
    :param compute_target: Name of the AML compute cluster to train on.
    :param concurrent_runs: Maximum number of simultaneous child runs.
    """
    # Authenticate via the Azure CLI token and load the experiment
    # definition from the current directory.
    cli_auth = AzureCliAuthentication()
    experiment = Experiment.from_directory(".", auth=cli_auth)
    ws = experiment.workspace

    cluster = ws.compute_targets[compute_target]
    food_data = ws.datastores['food_images']

    script_arguments = {"--data-dir": food_data.as_mount(), "--epochs": epochs}

    tf_est = TensorFlow(source_directory=".",
                        entry_script='code/train/train.py',
                        script_params=script_arguments,
                        compute_target=cluster,
                        conda_packages=['pillow', 'pandas'],
                        pip_packages=['click', 'seaborn'],
                        use_docker=True,
                        use_gpu=True,
                        framework_version='1.13')

    # Run on subset of food categories
    tf_est.run_config.arguments.extend(
        ['apple_pie', 'baby_back_ribs', 'baklava', 'beef_carpaccio'])

    param_sampler = RandomParameterSampling({
        '--minibatch-size': choice(16, 32, 64),
        '--learning-rate': loguniform(-9, -6),
        '--optimizer': choice('rmsprop', 'adagrad', 'adam')
    })

    # Create Early Termination Policy
    etpolicy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

    # Create HyperDrive Run Configuration
    hyper_drive_config = HyperDriveConfig(
        estimator=tf_est,
        hyperparameter_sampling=param_sampler,
        policy=etpolicy,
        primary_metric_name='acc',
        primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
        max_total_runs=iterations,
        max_concurrent_runs=concurrent_runs)

    # Submit the Hyperdrive Run
    print("Submitting Hyperdrive Run")
    hd_run = experiment.submit(hyper_drive_config)
    hd_run.wait_for_completion(raise_on_error=True, show_output=True)
    print("Finishing Run")

    # Expose the best child run's id as an Azure DevOps pipeline variable.
    best_run = hd_run.get_best_run_by_primary_metric()
    print(f'##vso[task.setvariable variable=run_id]{best_run.id}')
def get_cross_validation_hyperdrive_config(self, run_config: ScriptRunConfig) -> HyperDriveConfig:
    """
    Create a HyperDrive configuration whose only varying "hyperparameter" is the
    cross validation split index.

    :param run_config: The AzureML run configuration object that training for an individual model.
    :return: A hyperdrive configuration object.
    """
    sampler = self.get_cross_validation_hyperdrive_sampler()
    total_runs = self.get_total_number_of_cross_validation_runs()
    return HyperDriveConfig(run_config=run_config,
                            hyperparameter_sampling=sampler,
                            primary_metric_name=TrackedMetrics.Val_Loss.value,
                            primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                            max_total_runs=total_runs)
def get_cross_validation_hyperdrive_config(
        self, estimator: Estimator) -> HyperDriveConfig:
    """
    Create a HyperDrive configuration that runs one child per cross validation
    split, varying only the split index.

    :param estimator: The AzureML estimator object that runs model training.
    :return: A hyperdrive configuration object.
    """
    sampler = self.get_cross_validation_hyperdrive_sampler()
    return HyperDriveConfig(
        estimator=estimator,
        hyperparameter_sampling=sampler,
        primary_metric_name=TrackedMetrics.Val_Loss.value,
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=self.get_total_number_of_cross_validation_runs())
def get_cross_validation_hyperdrive_config(
        self, run_config: ScriptRunConfig) -> HyperDriveConfig:
    """
    Create a HyperDrive configuration that runs one child per cross validation
    split, using a grid over the split index.

    Because this relies on a val/Loss metric, when subclassing
    LightningContainer your implementation of LightningModule must log
    val/Loss. There is an example of this in HelloRegression's
    validation_step method.

    :param run_config: The AzureML run configuration object that training for an individual model.
    :return: A hyperdrive configuration object.
    """
    # One grid point per split index: 0 .. number_of_cross_validation_splits - 1.
    split_space = {
        CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY:
            choice(list(range(self.number_of_cross_validation_splits)))
    }
    return HyperDriveConfig(
        run_config=run_config,
        hyperparameter_sampling=GridParameterSampling(parameter_space=split_space),
        primary_metric_name=TrackedMetrics.Val_Loss.value,
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=self.number_of_cross_validation_splits)
# Create an estimator that uses the remote compute hyper_estimator = SKLearn( source_directory=experiment_folder, inputs=[diabetes_ds.as_named_input('diabetes') ], # Pass the dataset as an input compute_target=gpu_cluster, conda_packages=['pandas', 'ipykernel', 'matplotlib'], pip_packages=['azureml-sdk', 'argparse', 'pyarrow'], entry_script='diabetes_training.py') # Configure hyperdrive settings hyperdrive = HyperDriveConfig(estimator=hyper_estimator, hyperparameter_sampling=params, policy=policy, primary_metric_name='AUC', primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, max_total_runs=6, max_concurrent_runs=4) # Run the experiment run = experiment.submit(config=hyperdrive) #Get the best run best_run = run.get_best_run_by_primary_metric() best_run_metrics = best_run.get_metrics() parameter_values = best_run.get_details()['runDefinition']['arguments'] print('Best Run Id: ', best_run.id) print(' -AUC:', best_run_metrics['AUC']) print(' -Accuracy:', best_run_metrics['Accuracy'])
# NOTE(review): fragment — the opening `if` of the `else:` below is outside
# this view; presumably it tests whether hyperparameter tuning is enabled.
parameter_sampling = utils.get_parameter_sampling(
    experiment_settings["hyperparameter_sampling"]["method"],
    experiment_settings["hyperparameter_sampling"]["parameters"])
policy = utils.get_policy(
    experiment_settings["hyperparameter_sampling"]["policy"])
# Any goal string containing "max" means maximize; everything else minimizes.
primary_metric_goal = PrimaryMetricGoal.MAXIMIZE if "max" in experiment_settings[
    "hyperparameter_sampling"][
        "primary_metric_goal"] else PrimaryMetricGoal.MINIMIZE
run_config = HyperDriveConfig(
    estimator=estimator,
    hyperparameter_sampling=parameter_sampling,
    policy=policy,
    primary_metric_name=experiment_settings["hyperparameter_sampling"]
    ["primary_metric_name"],
    primary_metric_goal=primary_metric_goal,
    max_total_runs=experiment_settings["hyperparameter_sampling"]
    ["max_total_runs"],
    max_concurrent_runs=experiment_settings["hyperparameter_sampling"]
    ["max_concurrent_runs"],
    max_duration_minutes=experiment_settings["hyperparameter_sampling"]
    ["max_duration_minutes"])
else:
    # Hyperparameter tuning disabled: submit the plain estimator instead.
    run_config = estimator

# Submitting an Experiment and creating a Run
print("Submitting an experiment and creating a run")
run = exp.submit(run_config, tags=experiment_settings["run_tags"])

# Shows output of the run on stdout
run.wait_for_completion(show_output=True, wait_post_processing=True)
hyperdrive.loguniform( convert_base(1e-6), convert_base(5e-2)), # NB. loguniform on [exp(min), exp(max)] "--weight_decay": hyperdrive.uniform(5e-3, 15e-2), "--per_device_train_batch_size": hyperdrive.choice([16, 32]), } hyperparameter_sampling = RandomParameterSampling(search_space) policy = TruncationSelectionPolicy(truncation_percentage=50, evaluation_interval=2, delay_evaluation=0) hyperdrive_config = HyperDriveConfig( run_config=config, hyperparameter_sampling=hyperparameter_sampling, policy=policy, primary_metric_name="eval_matthews_correlation", primary_metric_goal=hyperdrive.PrimaryMetricGoal.MAXIMIZE, max_total_runs=20, max_concurrent_runs=8, ) run = Experiment( ws, "transformers-glue-finetuning-hyperdrive").submit(hyperdrive_config) print(run.get_portal_url()) run.wait_for_completion(show_output=True)
def build_pipeline(dataset, ws, config):
    """Build, publish and schedule the prednet AzureML pipeline for `dataset`.

    Steps: video decoding -> data preparation -> HyperDrive training ->
    model registration. The published pipeline is put on a datastore-polling
    schedule, and the pipeline name is returned.

    :param dataset: Name of the video dataset folder on the datastore.
    :param ws: AzureML Workspace.
    :param config: Dict with compute target names and ACR credentials.
    :return: The name of the published pipeline.
    """
    print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))

    base_dir = '.'
    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    shutil.copy(os.path.join(base_dir, 'video_decoding.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_submit.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'pipelines_create.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'train.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'prednet.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'keras_utils.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'data_preparation.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'model_registration.py'), script_folder)
    shutil.copy(os.path.join(base_dir, 'config.json'), script_folder)

    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except:  # NOTE(review): bare except — should catch ComputeTargetException only
        print("creating new compute target")
        provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                                    max_nodes=4,
                                                                    idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name, provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = config['gpu_compute']
    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except:  # NOTE(review): bare except — should catch ComputeTargetException only
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',
                                                                    max_nodes=10,
                                                                    idle_seconds_before_scaledown=1800)
        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name, provisioning_config)
        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    try:
        print(gpu_compute_target.get_status().serialize())
    except BaseException as e:
        print("Could not get status of compute target.")
        print(e)

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(conda_packages=["py-opencv=3.4.2"],
                                      pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D',
                                      pip_packages=["azure-storage-blob==1.5.0", "hickle==3.4.3",
                                                    "requests==2.21.0", "sklearn", "pandas==0.24.2",
                                                    "azureml-sdk", "numpy==1.16.2", "pillow==6.0.0"])

    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    # DataReference to where video data is stored.
    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))
    print("DataReference object created")

    # Intermediate pipeline data passed between the steps below.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames", datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']  # NOTE(review): hash_paths is deprecated in newer SDKs — confirm
    )
    print("video_decode step created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py",
        arguments=["--input_data", raw_data, "--output_data", preprocessed_data],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    data_prep.run_after(video_decoding)
    print("data_prep step created")

    # configure access to ACR for pulling our custom docker image
    acr = ContainerRegistry()
    acr.address = config['acr_address']
    acr.username = config['acr_username']
    acr.password = config['acr_password']

    est = Estimator(source_directory=script_folder,
                    compute_target=gpu_compute_target,
                    entry_script='train.py',
                    use_gpu=True,
                    node_count=1,
                    custom_docker_image="wopauli_1.8-gpu:1",
                    image_registry_details=acr,
                    user_managed=True
                    )

    ps = RandomParameterSampling(
        {
            '--batch_size': choice(1, 2, 4, 8),
            '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
            '--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),  #, "48, 96"),
            '--learning_rate': loguniform(-6, -1),
            '--lr_decay': loguniform(-9, -1),
            '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
            '--transfer_learning': choice("True", "False")
        }
    )

    policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=10)

    hdc = HyperDriveConfig(estimator=est,
                           hyperparameter_sampling=ps,
                           policy=policy,
                           primary_metric_name='val_loss',
                           primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                           max_total_runs=10,
                           max_concurrent_runs=5,
                           max_duration_minutes=60*6
                           )

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data,
            '--remote_execution',
            '--dataset', dataset
        ],
        inputs=[preprocessed_data],
        metrics_output=data_metrics,
        allow_reuse=True
    )
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=cpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.']
    )
    registration_step.run_after(hd_step)

    pipeline = Pipeline(workspace=ws, steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    # Publish and put the pipeline on a datastore-polling schedule.
    pipeline_name = 'prednet_' + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    schedule = Schedule.create(workspace=ws, name=pipeline_name + "_sch",
                               pipeline_id=published_pipeline.id,
                               experiment_name=pipeline_name,
                               datastore=def_blob_store,
                               wait_for_provisioning=True,
                               description="Datastore scheduler for Pipeline" + pipeline_name,
                               path_on_datastore=os.path.join('prednet/data/video', dataset, 'Train'),
                               polling_interval=1
                               )

    return pipeline_name
# NOTE(review): fragment — the opening of the sampling dict (and the key this
# first value belongs to) is outside this view.
uniform(0.5, 1),
'--learning_rate': uniform(0.005, 0.25),
'--min_data_in_leaf': choice(list(range(2, 501))),
'--lambda_l1': choice(list(range(201))),
'--lambda_l2': choice(list(range(201))),
'--n_estimators': choice(list(range(100, 4001, 100)))
})

# Configure hyperdrive settings
hyperdrive = HyperDriveConfig(run_config=script_config,
                              hyperparameter_sampling=params,
                              policy=None,  # no early termination
                              primary_metric_name='rmse',
                              primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                              max_total_runs=160,
                              max_concurrent_runs=4)

# Run the experiment
experiment = Experiment(workspace=ws, name='training_hyperdrive03')
run = experiment.submit(config=hyperdrive)
print("Experiment is running...")

# Show the status in the notebook as the experiment runs
# RunDetails(run).show()
run.wait_for_completion()
print("Experiment has done.")
# Run configuration for the topic-model training script.
src = ScriptRunConfig(source_directory="./topicmodel",
                      script='train.py',
                      arguments=args,
                      compute_target=compute_target,
                      environment=env)

# Randomly sample the number of topics to train with.
param_sampling = RandomParameterSampling({
    "--num-topics": choice(5, 10, 15, 20)
})

# Submit experiment
# NOTE(review): "c_v" is presumably the coherence metric logged by train.py — confirm.
hd = HyperDriveConfig(run_config=src,
                      hyperparameter_sampling=param_sampling,
                      primary_metric_name="c_v",
                      primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                      max_total_runs=100,
                      max_concurrent_runs=4)
run = exp.submit(config=hd)
run.wait_for_completion(show_output=False)

print(run.get_metrics())
print(run.get_file_names())

# Register model
best_run = run.get_best_run_by_primary_metric()
# NOTE(review): fragment — the opening of the sampling dict is outside this view.
'--learning_rate': uniform(1e-3, 2e-2),
'--momentum': uniform(.1, .95),
'--weight_decay': loguniform(-5, -3),
'--temperature': uniform(1, 9),
# '--lambda_const': uniform(.1, .3),
'--transfer_learning': choice("True", "False")
})

policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=10)

hdc = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='val_loss',
    primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
    max_total_runs=5,  # 100,
    max_concurrent_runs=5)

hd_step = HyperDriveStep(
    name="train_w_hyperdrive",
    hyperdrive_config=hdc,
    estimator_entry_script_arguments=[
        '--data-folder', labeled_data, '--logits-folder', logits_data,
        '--remote_execution'
    ],
    # estimator_entry_script_arguments=script_params,
    inputs=[labeled_data, logits_data],
    metrics_output=data_metrics,
    allow_reuse=True)
# NOTE(review): notebook fragment — the opening of the estimator call these
# keyword arguments belong to is outside this view.
framework_version='2.0',
use_gpu=True,
pip_packages=[
    'transformers==2.0.0',
    'azureml-dataprep[fuse,pandas]==1.1.29'
])

# %% [markdown]
# Finally, we add all our parameters in a [HyperDriveConfig](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive.hyperdriveconfig?view=azure-ml-py) class and submit it as a run.

# %%
from azureml.train.hyperdrive import HyperDriveConfig
hyperdrive_run_config = HyperDriveConfig(
    estimator=estimator4,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name=primary_metric_name,
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=10,
    max_concurrent_runs=2)

run4 = experiment.submit(hyperdrive_run_config)

# %% [markdown]
# When we view the details of our run this time, we will see information and metrics for every run in our hyperparameter tuning.

# %%
from azureml.widgets import RunDetails
RunDetails(run4).show()

# %% [markdown]
# We can retrieve the best run based on our defined metric.
# NOTE(review): fragment — the opening of the estimator call these keyword
# arguments belong to is outside this view.
script_params=script_params_3,
source_directory=os.path.dirname(os.path.realpath(__file__)),
compute_target=workspace.compute_targets["alwaysoncluster"],
distributed_training=MpiConfiguration(),
framework_version='1.4',
use_gpu=True,
pip_packages=[
    'numpy==1.15.4', 'pandas==0.23.4', 'scikit-learn==0.20.1',
    'scipy==1.0.0', 'matplotlib==3.0.2', 'utils==0.9.0',
    'onnxruntime==1.2.0', 'onnx==1.6.0'
])

experiment = Experiment(workspace=workspace, name="deeplearning")
run = experiment.submit(estimator)

if hyperdrive is True:
    # Define multi-run configuration
    hyperdrive_run_config = HyperDriveConfig(
        estimator=estimator,
        hyperparameter_sampling=param_sampling,
        policy=None,
        primary_metric_name="accuracy",
        primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
        max_total_runs=10,
        max_concurrent_runs=None)

    # Define the ML experiment
    experiment = Experiment(workspace=workspace, name="hyperdrive")
    # Submit the experiment
    run = experiment.submit(hyperdrive_run_config)
# NOTE(review): fragment — the opening of the sampling dict is outside this view.
'--model_type': model_type,
'--max_seq_len': choice(128, 256),
'--embeds_dropout': choice(0.1, 0.2, 0.3)
})

## Termination policy
early_termination_policy = BanditPolicy(slack_factor=0.1,
                                        evaluation_interval=1,
                                        delay_evaluation=3)

## Prepare HyperDrive Config
hdc = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='f1macro',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=40,
    max_concurrent_runs=4)

## Run hyperparameter tuning
hyperdrive_run = exp.submit(config=hdc)
if args.update_model:
    hyperdrive_run.wait_for_completion(show_output=True)
    ## Get Results
    best_run = hyperdrive_run.get_best_run_by_primary_metric()

## Experiment
experiment_name = args.project_name + "-train"
exp = Experiment(workspace=ws, name=experiment_name)

# Parameters determined by hyperparams script
script_params_hyper = {
    '--learning_rate':
from azureml.core import Workspace, Experiment
from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal

ws = Workspace.from_config()

# HyperDrive sweep around the script run configuration, maximizing Accuracy.
hyperdrive = HyperDriveConfig(run_config=script_config,
                              hyperparameter_sampling=param_sampling,
                              policy=None,
                              primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                              # fix: was the bare name `Accuracy` (NameError);
                              # the metric name must be a string matching what
                              # the training script logs — confirm 'Accuracy'.
                              primary_metric_name='Accuracy',
                              max_total_runs=6,
                              max_concurrent_runs=4)

experiment = Experiment(ws, name="hyper_training")
hyperdrive_run = experiment.submit(config=hyperdrive)

# monitoring the child runs
# fix: was iterating `run.get_children()` but no `run` variable exists here;
# the submitted run is `hyperdrive_run`.
for child_run in hyperdrive_run.get_children():
    print(child_run.id, child_run.get_metrics())

for child_run in hyperdrive_run.get_children_sorted_by_primary_metric():
    print(child_run)

best_run = hyperdrive_run.get_best_run_by_primary_metric()
# NOTE(review): fragment — the opening of the estimator call these keyword
# arguments belong to is outside this view.
entry_script=PathsConfig.entry_script,
use_gpu=True,
custom_docker_image=settings["IMAGE_NAME"],
)

if GeneralConfig.hyperdrive:
    # Only the pretrained ResNet50 search space is defined so far.
    if GeneralConfig.architecture_type == "PretrainedResNet50":
        hyperparams_space = HyperdriveConfig.pretrained_resnet50_hyperparams_space
    else:
        raise NotImplementedError
    # Each hyperparameter is sampled from a discrete choice over its range.
    hyperparams_space_format = {
        parameter: choice(parameter_range)
        for parameter, parameter_range in hyperparams_space.items()
    }
    parameters_sampling = RandomParameterSampling(hyperparams_space_format)
    policy = BanditPolicy(
        evaluation_interval=HyperdriveConfig.evaluation_interval,
        slack_factor=HyperdriveConfig.slack_factor,
    )
    hdc = HyperDriveConfig(
        estimator=est,
        hyperparameter_sampling=parameters_sampling,
        policy=policy,
        primary_metric_name="Accuracy",
        primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
        max_total_runs=HyperdriveConfig.max_total_runs,
        max_concurrent_runs=HyperdriveConfig.max_concurrent_runs,
    )
    run = exp.submit(hdc)
else:
    # Hyperdrive disabled: submit a single plain estimator run.
    run = exp.submit(est)
# NOTE(review): fragment — the opening of the estimator call these keyword
# arguments belong to is outside this view.
script_params={
    '--data-folder': dataset.as_named_input('mnist').as_mount()
},
compute_target=compute_target,
entry_script='tf_mnist2.py',
framework_version='2.0',
use_gpu=True,
pip_packages=['azureml-dataprep[pandas,fuse]'])

policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

htc = HyperDriveConfig(estimator=est,
                       hyperparameter_sampling=ps,
                       policy=policy,
                       primary_metric_name='validation_acc',
                       primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                       max_total_runs=8,
                       max_concurrent_runs=4)

htr = exp.submit(config=htc)
# RunDetails(htr).show()
htr.wait_for_completion(show_output=True)
assert (htr.get_status() == "Completed")

best_run = htr.get_best_run_by_primary_metric()
print(best_run.get_file_names())
compute_taret = cluster ) #creating hyper parmas from azureml.train.hyperdrive import GridParameterSampling, choice hyper_params = GridParameterSampling({ '--n_estimators': choice(10,20,30,100), '--min_samples_leaf': choice(1,2,5) }) #configuring hyperdrive class from azureml.train.hyperdrive import HyperDriveConfig,PrimaryMetricGoal hyper_config = HyperDriveConfig(run_config=script_run, hyperparameter_sampling = hyper_params, policy= None, primary_metric_name = 'accuray', primary_metric_goal = PrimaryMetricGoal.MAXIMIZE, max_total_runs = 20, max_concurrent_runs=2) exp = Experiment(ws, "My_hyperdrive_exp") new_run = exp.submit(script_run) new_run.wait_for_completion(show_output = True)
def create_experiment_config(workspace):
    """Create the AzureML training pipeline (data prep + model training).

    Settings for each step are loaded from steps/<step>/step.json. The model
    training step runs under HyperDrive when a complete hyperparameter
    sampling section is present; otherwise it falls back to a plain
    EstimatorStep.

    :param workspace: The AzureML workspace to build the pipeline in.
    :return: A validated Pipeline object.
    """
    ########################################
    ### Creating data prep Pipeline Step ###
    ########################################

    # Load settings
    print("Loading settings")
    data_prep_step_path = os.path.join("steps", "data_prep")
    with open(os.path.join(data_prep_step_path, "step.json")) as f:
        data_prep_settings = json.load(f)

    # Setup datasets of first step
    print("Setting up datasets")
    data_prep_input = Dataset.get_by_name(
        workspace=workspace,
        name=data_prep_settings.get("dataset_input_name",
                                    None)).as_named_input(
                                        data_prep_settings.get(
                                            "dataset_input_name",
                                            None)).as_mount()
    data_prep_output = PipelineData(
        name=data_prep_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=data_prep_settings.get(
                                "datastore_output_name",
                                "workspaceblobstore")),
        output_mode="mount").as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #data_prep_output.register(
    #    name=data_prep_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    data_prep_dependencies = CondaDependencies.create(
        pip_packages=data_prep_settings.get("pip_packages", []),
        conda_packages=data_prep_settings.get("conda_packages", []),
        python_version=data_prep_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    data_prep_run_config = RunConfiguration(
        conda_dependencies=data_prep_dependencies,
        framework=data_prep_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    data_prep_compute_target = ComputeTarget(
        workspace=workspace,
        name=data_prep_settings.get("compute_target_name", None))

    # Create python step
    print("Creating Step")
    data_prep = PythonScriptStep(
        name=data_prep_settings.get("step_name", None),
        script_name=data_prep_settings.get("script_name", None),
        arguments=data_prep_settings.get("arguments", []),
        compute_target=data_prep_compute_target,
        runconfig=data_prep_run_config,
        inputs=[data_prep_input],
        outputs=[data_prep_output],
        params=data_prep_settings.get("parameters", []),
        source_directory=data_prep_step_path,
        allow_reuse=data_prep_settings.get("allow_reuse", True),
        version=data_prep_settings.get("version", None),
    )

    ###############################################
    ### Creating data model train Pipeline Step ###
    ###############################################

    # Load settings
    print("Loading settings")
    model_train_step_path = os.path.join("steps", "model_train")
    with open(os.path.join(model_train_step_path, "step.json")) as f:
        model_train_settings = json.load(f)
    hyperparameter_sampling_settings = model_train_settings.get(
        "hyperparameter_sampling", {})

    # Setup datasets: training consumes the data prep step's output.
    print("Setting up datasets")
    model_train_input = data_prep_output.as_named_input(
        name=model_train_settings.get("dataset_input_name", None))
    model_train_output = PipelineData(
        name=model_train_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=model_train_settings.get(
                                "datastore_output_name", None)),
        output_mode="mount",
    ).as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #model_train_output.register(
    #    name=model_train_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    model_train_dependencies = CondaDependencies.create(
        pip_packages=model_train_settings.get("pip_packages", []),
        conda_packages=model_train_settings.get("conda_packages", []),
        python_version=model_train_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    model_train_run_config = RunConfiguration(
        conda_dependencies=model_train_dependencies,
        framework=model_train_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    model_train_compute_target = ComputeTarget(
        workspace=workspace,
        name=model_train_settings.get("compute_target_name", None))

    # Create distributed training backend
    print("Creating distributed training backend")
    distributed_training_backend = get_distributed_backend(
        backend_name=model_train_settings.get("distributed_backend", None))

    # Create Estimator for Training
    print("Creating Estimator for training")
    model_train_estimator = Estimator(
        source_directory=model_train_step_path,
        entry_script=model_train_settings.get("script_name", None),
        environment_variables=model_train_settings.get("parameters", None),
        compute_target=model_train_compute_target,
        node_count=model_train_settings.get("node_count", None),
        distributed_training=distributed_training_backend,
        conda_packages=model_train_settings.get("conda_packages", None),
        pip_packages=model_train_settings.get("pip_packages", None),
    )

    try:
        # Create parameter sampling
        print("Creating Parameter Sampling")
        parameter_dict = {}
        parameters = hyperparameter_sampling_settings.get(
            "parameters",
            {}) if "parameters" in hyperparameter_sampling_settings else {}
        for parameter_name, parameter_details in parameters.items():
            parameter_distr = get_parameter_distribution(
                distribution=parameter_details.get("distribution", None),
                **parameter_details.get("settings", {}))
            parameter_dict[f"--{parameter_name}"] = parameter_distr
        model_train_ps = get_parameter_sampling(
            sampling_method=hyperparameter_sampling_settings.get(
                "method", None),
            parameter_dict=parameter_dict)

        # Get Policy definition; everything that is not a known policy key is
        # forwarded to the policy constructor as-is.
        policy_settings = hyperparameter_sampling_settings.get("policy", {})
        kwargs = {
            key: value
            for key, value in policy_settings.items() if key not in
            ["policy_method", "evaluation_interval", "delay_evaluation"]
        }

        # Create termination policy
        print("Creating early termination policy")
        model_train_policy = get_policy(
            policy_method=policy_settings.get("method", ""),
            evaluation_interval=policy_settings.get("evaluation_interval",
                                                    None),
            delay_evaluation=policy_settings.get("delay_evaluation", None),
            **kwargs)

        # Create HyperDriveConfig
        print("Creating HyperDriveConfig")
        model_train_hyperdrive_config = HyperDriveConfig(
            estimator=model_train_estimator,
            hyperparameter_sampling=model_train_ps,
            policy=model_train_policy,
            primary_metric_name=hyperparameter_sampling_settings.get(
                "primary_metric", None),
            # fix: default "" (was None) — `"min" in None` raised TypeError
            # when the goal key was absent, silently forcing the
            # EstimatorStep fallback instead of the intended MAXIMIZE default.
            primary_metric_goal=PrimaryMetricGoal.MINIMIZE
            if "min" in hyperparameter_sampling_settings.get(
                "primary_metric_goal", "") else PrimaryMetricGoal.MAXIMIZE,
            max_total_runs=hyperparameter_sampling_settings.get(
                "max_total_runs", 1),
            max_concurrent_runs=hyperparameter_sampling_settings.get(
                "max_concurrent_runs", 1),
            max_duration_minutes=hyperparameter_sampling_settings.get(
                "max_duration_minutes", None))

        # Create HyperDriveStep
        print("Creating HyperDriveStep")
        model_train = HyperDriveStep(
            name=model_train_settings.get("step_name", None),
            hyperdrive_config=model_train_hyperdrive_config,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments", None),
            inputs=[model_train_input],
            outputs=[model_train_output],
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", True))
    except Exception as exc:
        # fix: was a bare `except:` that also caught SystemExit/KeyboardInterrupt
        # and hid the underlying error; the fallback behavior is preserved but
        # the real exception is now surfaced.
        print("Not all required parameters specified for HyperDrive step")
        print(exc)

        # Create EstimatorStep
        print("Creating EstimatorStep")
        model_train = EstimatorStep(
            name=model_train_settings.get("step_name", None),
            estimator=model_train_estimator,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments", None),
            inputs=[model_train_input],
            outputs=[model_train_output],
            compute_target=model_train_compute_target,
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", True))

    #########################
    ### Creating Pipeline ###
    #########################

    # Create Pipeline
    print("Creating Pipeline")
    pipeline = Pipeline(
        workspace=workspace,
        steps=[model_train],
        description="Training Pipeline",
    )

    # Validate pipeline
    print("Validating pipeline")
    pipeline.validate()

    return pipeline
def build_prednet_pipeline(dataset, ws):
    """Build, validate, publish, and schedule the PredNet training pipeline.

    Pipeline stages, in order:
      1. prepare_data      - preprocess raw video data (CPU)
      2. train_w_hyperdrive - Bayesian HyperDrive search training PredNet (GPU)
      3. register_prednet  - register the best PredNet model
      4. batch_scoring     - score preprocessed data with the model (GPU)
      5. train_clf         - train a downstream classifier (CPU)
      6. register_clf      - register the classifier

    The validated pipeline is published under ``"prednet_" + dataset`` and a
    datastore-triggered ``Schedule`` is created for it; finally one run is
    submitted immediately.

    :param dataset: Name of the dataset to train on; also used to name the
        published pipeline, its schedule, and the datastore trigger path.
    :param ws: The Azure ML workspace in which the pipeline is built and run.
    """
    print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))

    base_dir = "."
    def_blob_store = ws.get_default_datastore()

    # Folder for scripts that need to be uploaded to the AML compute target.
    # NOTE(review): raises FileExistsError if ./scripts already exists from a
    # previous run — confirm whether reruns should reuse or recreate it.
    script_folder = "./scripts"
    os.makedirs(script_folder)

    shutil.copytree(os.path.join(base_dir, "models"),
                    os.path.join(base_dir, script_folder, "models"))
    # Stage every entry script referenced by the pipeline steps below.
    for entry_script in (
        "train.py",
        "data_preparation.py",
        "register_prednet.py",
        "batch_scoring.py",
        "train_clf.py",
        "register_clf.py",
    ):
        shutil.copy(os.path.join(base_dir, entry_script), script_folder)

    # Reuse existing compute targets (looked up by name from global CLI args).
    cpu_compute_name = args.cpu_compute_name
    cpu_compute_target = AmlCompute(ws, cpu_compute_name)
    print("found existing compute target: %s" % cpu_compute_name)
    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    gpu_compute_name = args.gpu_compute_name
    gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
    print(gpu_compute_target.get_status().serialize())

    # Shared environment/run configuration for the script steps.
    env = Environment.get(ws, "prednet")
    runconfig = RunConfiguration()
    runconfig.environment = env
    print("PipelineData object created")

    # DataReference to where raw data is stored.
    raw_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="raw_data",
        path_on_datastore=os.path.join("prednet", "data", "raw_data"),
    )
    print("DataReference object created")

    # Intermediate PipelineData objects wiring outputs of one step to
    # inputs of the next.
    preprocessed_data = PipelineData("preprocessed_data", datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    hd_child_cwd = PipelineData("prednet_model_path", datastore=def_blob_store)
    scored_data = PipelineData("scored_data", datastore=def_blob_store)
    model_path = PipelineData("model_path", datastore=def_blob_store)

    # Prepare dataset for training/testing the recurrent neural network.
    data_prep = PythonScriptStep(
        name="prepare_data",
        script_name="data_preparation.py",
        arguments=[
            "--raw_data", raw_data,
            "--preprocessed_data", preprocessed_data,
            "--dataset", dataset,
        ],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    print("data_prep step created")

    # Single-node GPU estimator used as the HyperDrive child-run template.
    est = Estimator(
        source_directory=script_folder,
        compute_target=gpu_compute_target,
        entry_script="train.py",
        node_count=1,
        environment_definition=env,
    )

    # Bayesian sampling over the PredNet hyperparameters. No early-termination
    # policy is set — Bayesian sampling does not support one.
    ps = BayesianParameterSampling({
        "--batch_size": choice(1, 2, 4, 10),
        "--filter_sizes": choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        "--stack_sizes": choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),
        "--learning_rate": uniform(1e-6, 1e-3),
        "--lr_decay": uniform(1e-9, 1e-2),
        "--freeze_layers": choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2",
                                  "2, 3", "0", "3"),
    })

    hdc = HyperDriveConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        primary_metric_name="val_loss",
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=3,
        max_concurrent_runs=3,
        max_duration_minutes=60 * 6,
    )

    train_prednet = HyperDriveStep(
        "train_w_hyperdrive",
        hdc,
        estimator_entry_script_arguments=[
            "--preprocessed_data", preprocessed_data,
            "--remote_execution",
            "--dataset", dataset,
        ],
        inputs=[preprocessed_data],
        outputs=[hd_child_cwd],
        metrics_output=data_metrics,
        allow_reuse=True,
    )
    train_prednet.run_after(data_prep)

    # Register the best PredNet model found by the HyperDrive search.
    register_prednet = PythonScriptStep(
        name="register_prednet",
        script_name="register_prednet.py",
        arguments=[
            "--data_metrics", data_metrics,
        ],
        compute_target=cpu_compute_target,
        inputs=[data_metrics, hd_child_cwd],
        source_directory=script_folder,
        allow_reuse=True,
    )
    register_prednet.run_after(train_prednet)

    # Score the preprocessed data with the registered PredNet model.
    batch_scoring = PythonScriptStep(
        name="batch_scoring",
        script_name="batch_scoring.py",
        arguments=[
            "--preprocessed_data", preprocessed_data,
            "--scored_data", scored_data,
            "--dataset", dataset,
        ],
        compute_target=gpu_compute_target,
        inputs=[preprocessed_data],
        outputs=[scored_data],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    batch_scoring.run_after(register_prednet)

    # Train the downstream classifier on the scored data.
    train_clf = PythonScriptStep(
        name="train_clf",
        script_name="train_clf.py",
        arguments=[
            "--preprocessed_data", preprocessed_data,
            "--scored_data", scored_data,
            "--model_path", model_path,
        ],
        compute_target=cpu_compute_target,
        inputs=[preprocessed_data, scored_data],
        outputs=[model_path],
        source_directory=script_folder,
        runconfig=runconfig,
        allow_reuse=True,
    )
    train_clf.run_after(batch_scoring)

    # Register the trained classifier.
    register_clf = PythonScriptStep(
        name="register_clf",
        script_name="register_clf.py",
        arguments=["--model_path", model_path],
        inputs=[model_path],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        allow_reuse=True,
        runconfig=runconfig,
    )
    register_clf.run_after(train_clf)

    pipeline = Pipeline(
        workspace=ws,
        steps=[
            data_prep,
            train_prednet,
            register_prednet,
            batch_scoring,
            train_clf,
            register_clf,
        ],
    )
    pipeline.validate()

    # Publish and attach a datastore-triggered schedule watching the raw
    # training data path; polling_interval is in minutes (daily).
    pipeline_name = "prednet_" + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    _ = Schedule.create(
        workspace=ws,
        name=pipeline_name + "_sch",
        pipeline_id=published_pipeline.id,
        experiment_name=pipeline_name,
        datastore=def_blob_store,
        wait_for_provisioning=True,
        description="Datastore scheduler for Pipeline" + pipeline_name,
        path_on_datastore=os.path.join("prednet/data/raw_data", dataset,
                                       "Train"),
        polling_interval=60 * 24,
    )

    published_pipeline.submit(ws, pipeline_name)