def run_azure_pytorch():
    """Run ``train-pytorch.py`` as an Azure ML PyTorch job and register the model.

    Obtains a compute target from ``setup_azure_gpu()``, submits the script in
    ``./train-scripts`` to the ``pytorch`` experiment of the module-level
    workspace ``ws``, streams the run output until completion, prints the
    run's metrics and file names, and registers
    ``outputs/pytorch_model.pt`` under the experiment name.

    The compute target is deleted on exit, including when submission or the
    run itself raises, so a failed job does not leave the target allocated.
    """
    compute_target = setup_azure_gpu()
    try:
        experiment_name = 'pytorch'
        exp = Experiment(workspace=ws, name=experiment_name)

        ds = ws.get_default_datastore()
        print(ds.datastore_type, ds.account_name, ds.container_name)
        # One-off data upload, kept for reference (disabled):
        # ds.upload(src_dir='./data', target_path='mnist', overwrite=True, show_progress=True)

        script_params = {'--data_dir': ds}
        pt_est = PyTorch(source_directory='./train-scripts',
                         script_params=script_params,
                         compute_target=compute_target,
                         entry_script='train-pytorch.py',
                         use_gpu=True)

        run = exp.submit(pt_est)
        run.wait_for_completion(show_output=True)
        print(run.get_metrics())
        print(run.get_file_names())

        # register model
        model = run.register_model(model_name=experiment_name,
                                   model_path='outputs/pytorch_model.pt')
        print(model.name, model.id, model.version, sep='\t')
    finally:
        # Always release the compute target, even if the run failed.
        compute_target.delete()
def GetEstimator(environmentInfo, inpData):
    '''
    Defines estimator for AML experiment. Method signature defined by AP.Data.

    Args:
        environmentInfo:
            workspace: The workspace with the correct svc that the run will
                be submitted to. This gives you access to the default
                datastore, keyvault, container registry.
            datastore: Datastore where the data is located in
            compute: compute cluster the run should target
        inpData:
            dataDir: dataset directory
            dataset_name: name of dataset
            training_config: path of training configuration file
            toolsDir: directory with auxiliary tools
            scriptConfig: path of additional configuration for this method
            sourceDir: directory with experiment code

    Returns:
        A PyTorch estimator that runs ``train_model_pytorch.py`` from
        ``inpData.sourceDir`` on ``environmentInfo.compute`` with GPU enabled.
    '''
    # The authenticated workspace (environmentInfo.workspace) also exposes the
    # default key vault via get_default_keyvault() should secrets be needed:
    # https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.keyvault(class)?view=azure-ml-py

    script_params = {
        "--data_dir": inpData.dataDir,
        "--training_config": json.dumps(read_from_json(inpData.training_config))
    }
    print(f'using script_params {script_params}')

    # `or {}` tolerates an empty/None config; `.get` tolerates missing keys so
    # the defaults below are actually reachable.
    estimatorConfig = read_from_json(inpData.scriptConfig) or {}
    conda_packages = estimatorConfig.get("conda_packages")
    pip_packages = estimatorConfig.get("pip_packages")
    print(
        f'got {conda_packages} and {pip_packages} from config for conda and pip packages'
    )

    # Defaults used when the JSON config does not set a package list.
    if conda_packages is None:
        conda_packages = ["numpy", "pillow"]
    if pip_packages is None:
        pip_packages = [
            "facenet-pytorch", "torch===1.4.0", "torchvision===0.5.0"
        ]

    return PyTorch(source_directory=inpData.sourceDir,
                   script_params=script_params,
                   compute_target=environmentInfo.compute,
                   entry_script='train_model_pytorch.py',
                   use_gpu=True,
                   source_directory_data_store=environmentInfo.datastore,
                   conda_packages=conda_packages,
                   pip_packages=pip_packages)
def train_step(train_dir, valid_dir, compute_target):
    '''
    Build the pipeline step that runs ``train.py`` (described by the original
    author as fine-tuning a RESNET-18 model with PyTorch) on the given
    training and validation image directories.

    :param train_dir: The reference to the directory containing the training data
    :type train_dir: DataReference
    :param valid_dir: The reference to the directory containing the validation data
    :type valid_dir: DataReference
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget

    :return: The training step, step outputs dictionary (keys: model_dir)
    :rtype: EstimatorStep, dict
    '''
    # Hyperparameters exposed as pipeline parameters so they can be
    # overridden per pipeline submission.
    epochs_param = PipelineParameter(name='num_epochs', default_value=25)
    batch_param = PipelineParameter(name='batch_size', default_value=16)
    lr_param = PipelineParameter(name='learning_rate', default_value=0.001)
    momentum_param = PipelineParameter(name='momentum', default_value=0.9)

    # Output directory for the trained model, stored next to the training data.
    trained_model_dir = PipelineData(
        name='model_dir',
        pipeline_output_name='model_dir',
        datastore=train_dir.datastore,
        output_mode='mount',
        is_directory=True)

    pytorch_estimator = PyTorch(
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        entry_script='train.py',
        framework_version='1.3',
        compute_target=compute_target,
        use_gpu=True)

    # Command-line arguments passed to train.py.
    script_arguments = [
        '--train_dir', train_dir,
        '--valid_dir', valid_dir,
        '--output_dir', trained_model_dir,
        '--num_epochs', epochs_param,
        '--batch_size', batch_param,
        '--learning_rate', lr_param,
        '--momentum', momentum_param
    ]

    training_step = EstimatorStep(
        name="Train Model",
        estimator=pytorch_estimator,
        estimator_entry_script_arguments=script_arguments,
        inputs=[train_dir, valid_dir],
        compute_target=compute_target,
        outputs=[trained_model_dir],
        allow_reuse=False)

    return training_step, {'model_dir': trained_model_dir}
def main(args, ws):
    """Submit ``train.py`` as an MPI-distributed PyTorch run.

    Args:
        args: Parsed arguments; ``args.cluster`` names the compute target in
            the workspace and ``args.nodes`` is the node count for the job.
        ws: The Azure ML workspace to submit to.
    """
    cluster = ws.compute_targets[args.cluster]
    print(cluster.get_status().serialize())

    horovod_experiment = Experiment(ws, name='pytorch-distributed-horovod')

    # The training script lives next to this file.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    distributed_estimator = PyTorch(
        source_directory=script_dir,
        compute_target=cluster,
        entry_script='train.py',
        node_count=args.nodes,
        distributed_training=Mpi(),
        use_gpu=True,
    )

    submitted_run = horovod_experiment.submit(distributed_estimator)
    print(submitted_run)
def pytorch_version_from_conda_dependencies(conda_dependencies: CondaDependencies) -> Optional[str]:
    """
    Given a CondaDependencies object, look for a spec of the form "pytorch=...", and return
    whichever supported version is compatible with the value, or None if there isn't one.

    Accepted spec forms are "pytorch=1.4", "pytorch==1.4.0", "pytorch=1.4.0=<build>" and
    wildcard forms such as "pytorch=1.4.*". Two versions are compatible when the shorter
    one is a prefix of the longer one *component by component*, so "1.4.0" matches a
    supported "1.4", while "1.10" does not match a supported "1.1".

    :param conda_dependencies: Object whose `conda_packages` yields conda spec strings.
    :return: The first compatible entry of `PyTorch.get_supported_versions()`, or None.
    """
    supported_versions = PyTorch.get_supported_versions()
    for spec in conda_dependencies.conda_packages:
        name, _, constraint = spec.partition("=")
        if name.strip() != "pytorch":
            continue
        # Drop a second "=" (exact-match form), any build string, and a trailing wildcard.
        version = constraint.lstrip("=").split("=")[0].strip().rstrip(".*")
        if not version:
            # "pytorch=" carries no version; an empty prefix must not match everything.
            continue
        requested = version.split(".")
        for supported in supported_versions:
            candidate = supported.split(".")
            shared = min(len(requested), len(candidate))
            # Compare whole components, not characters, to avoid "1.1" ~ "1.10".
            if requested[:shared] == candidate[:shared]:
                return supported
    return None
def evaluate_step(model_dir, test_dir, compute_target):
    '''
    Build the pipeline step that runs ``evaluate.py`` on the trained model and
    the testing data, writing the result to an ``accuracy_file`` output.

    :param model_dir: The reference to the directory containing the trained model
    :type model_dir: DataReference
    :param test_dir: The reference to the directory containing the testing data
    :type test_dir: DataReference
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget

    :return: The evaluate step, step outputs dictionary (keys: accuracy_file)
    :rtype: EstimatorStep, dict
    '''
    # Single-file output, stored on the same datastore as the test data.
    accuracy_output = PipelineData(
        name='accuracy_file',
        pipeline_output_name='accuracy_file',
        datastore=test_dir.datastore,
        output_mode='mount',
        is_directory=False)

    pytorch_estimator = PyTorch(
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        entry_script='evaluate.py',
        framework_version='1.3',
        compute_target=compute_target,
        use_gpu=True)

    # Command-line arguments passed to evaluate.py.
    script_arguments = [
        '--test_dir', test_dir,
        '--model_dir', model_dir,
        '--accuracy_file', accuracy_output
    ]

    evaluation_step = EstimatorStep(
        name="Evaluate Model",
        estimator=pytorch_estimator,
        estimator_entry_script_arguments=script_arguments,
        inputs=[model_dir, test_dir],
        outputs=[accuracy_output],
        compute_target=compute_target,
        allow_reuse=True)

    return evaluation_step, {'accuracy_file': accuracy_output}
# %%
# Stage the training script in a dedicated project folder.
project_folder = './pytorch-mnist'
os.makedirs(project_folder, exist_ok=True)

# %%
import shutil

shutil.copy('mnist.py', project_folder)

# %%
from azureml.train.dnn import PyTorch

estimator = PyTorch(
    source_directory=project_folder,
    script_params={'--output-dir': './outputs'},
    compute_target=compute_target,
    entry_script='mnist.py',
    use_gpu=False,
)

# Swap the pinned pytorch=0.4.0 package for the nightly build from the
# 'pytorch' channel.
conda_deps = estimator.conda_dependencies
conda_deps.remove_conda_package('pytorch=0.4.0')
conda_deps.add_conda_package('pytorch-nightly')
conda_deps.add_channel('pytorch')

# %%
run = exp.submit(estimator)
run.wait_for_completion(show_output=True)

# %%
run.get_file_names()

# Download the exported ONNX model to the same relative path locally.
model_path = os.path.join('outputs', 'mnist.onnx')
run.download_file(model_path, output_file_path=model_path)
def my_azure_app(cfg: DictConfig) -> None:
    """Submit ``run.py`` to Azure ML as a PyTorch estimator run.

    Steps, in order:

    1. Dump the (unresolved) config to a timestamped JSON file under
       ``root_path``.
    2. Open the ``experiments`` workspace using the subscription id from the
       ``AZ_SUBS_ID`` environment variable.
    3. Reuse the ``gpucluster`` compute target, or create it with the
       configured ``vm_size``.
    4. Upload the data directory and the JSON config to the default datastore.
    5. Submit the estimator to the ``<experiment_name>_azure`` experiment.

    Args:
        cfg: Config providing ``yaml_file``, ``root_path``, ``data_subdir``,
            ``experiment_name``, ``disable_gpu`` and ``vm_size``.

    Raises:
        RuntimeError: If ``AZ_SUBS_ID`` is not set.
    """
    print(cfg.pretty())
    args_dict = OmegaConf.to_container(cfg, resolve=False)

    # Name the dumped config after the yaml file plus a timestamp.
    yaml_file_nm = args_dict["yaml_file"].split("/")[-1].split(".")[0]
    conf_file = os.path.join(
        args_dict["root_path"],
        yaml_file_nm + "_" + str(datetime.datetime.now()) + ".json",
    )
    print(conf_file)
    with open(conf_file, "w") as out:
        out.write(json.dumps(args_dict))

    cluster_name = "gpucluster"
    experiment_name = args_dict["experiment_name"] + "_azure"
    disable_gpu = args_dict["disable_gpu"]
    # Hydra overrides the working directory, so take the sources from the
    # directory the program was originally launched from.
    script_folder = hydra.utils.get_original_cwd()
    data_path = os.path.join(args_dict["root_path"], args_dict["data_subdir"])

    sub_id = os.getenv("AZ_SUBS_ID")
    if sub_id is None:
        # Explicit check: an `assert` would be stripped under `python -O`.
        raise RuntimeError("AZ_SUBS_ID environment variable is not set")

    ws = Workspace.get(
        name="experiments",
        subscription_id=sub_id,
        resource_group="default_resource_group",
    )
    # print(AmlCompute.supported_vmsizes(workspace=ws))

    # Reuse the cluster when it exists; otherwise provision it with the
    # configured VM size (AmlCompute is created in the workspace's region).
    try:
        compute_target = ComputeTarget(workspace=ws, name=cluster_name)
        print("Found existing compute target")
    except ComputeTargetException:
        print("Creating a new compute target...")
        compute_config = AmlCompute.provisioning_configuration(
            vm_size=args_dict["vm_size"], max_nodes=1)
        compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
        compute_target.wait_for_completion(show_output=True,
                                           min_node_count=None,
                                           timeout_in_minutes=10)

    s = ws.get_default_datastore()

    # Upload the data so root_path can be referenced on the datastore.
    _ = s.upload(
        src_dir=data_path,
        target_path=data_path,
        overwrite=False,
        show_progress=True,
    )

    # Split the config path into "everything but the file name" and the
    # file name itself.
    script_target_path, _, script_fname = conf_file.rpartition("/")
    print(script_target_path)
    print(script_fname)
    print("---" * 100)

    azure_script_path = s.upload_files(
        files=[conf_file],
        target_path=script_target_path,
        overwrite=True,
        show_progress=True,
    )
    print(azure_script_path)

    azure_script_abs_path = DataReference(datastore=s,
                                          data_reference_name="input_data",
                                          path_on_datastore=conf_file)
    azure_root_path = DataReference(
        datastore=s,
        data_reference_name="root_data",
        path_on_datastore=args_dict["root_path"],
    )

    exp = Experiment(workspace=ws, name=experiment_name)

    # Using pytorch estimator - proper way to submit pytorch jobs
    script_params = {
        "--config_file": azure_script_abs_path,
        "--root_path": azure_root_path,
        "--experiment_name": experiment_name,
    }
    print("GPU Disabled: {}".format(disable_gpu))
    estimator = PyTorch(
        source_directory=script_folder,
        script_params=script_params,
        compute_target=compute_target,
        entry_script="run.py",
        use_gpu=not disable_gpu,
        pip_packages=["pillow==5.4.1"],
    )

    # The returned run object is not needed here.
    _ = exp.submit(estimator)
import os

import azureml.core
from azureml.core import Workspace
from azureml.core.run import Run
from azureml.core.experiment import Experiment
from azureml.train.dnn import PyTorch

# Workspace and job settings.
subscription_id = ""  # The ID of the Azure Subscription
resource_group = "AdvanceAnalytics.Aml.Experiments"  # Name of a logical resource group
workspace_name = "aa-ml-aml-workspace"  # The name of the workspace to look for or to create
workspace_region = 'eastus'  # Location of the workspace
computetarget_vm = 'Standard_NC6'  # Size of the VM to use
experiment_name = 'azureml-gpubenchmark'
train_script = 'train_and_track.py'

# Look up the workspace, creating it if it does not exist yet.
ws = Workspace.create(
    name=workspace_name,
    subscription_id=subscription_id,
    resource_group=resource_group,
    location=workspace_region,
    exist_ok=True)

# Build the source path with os.path.join so it resolves on every platform;
# on Windows this yields the same '.\fastai' as before.
src = PyTorch(source_directory=os.path.join('.', 'fastai'),
              compute_target='amlcompute',
              vm_size=computetarget_vm,
              entry_script=train_script,
              use_gpu=True,
              pip_packages=['fastai', "azureml-sdk"])

experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(src)
run.wait_for_completion(show_output=True)
def _start_estimator_training(self, training_name: str, estimator_type: str = None, input_datasets: np.array = None, input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, script_parameters: dict = None, show_widget: bool = True, **kwargs):
    '''
    Will start a new training using an Estimator, taking the training name as the folder of the run
    Args:
        training_name (str): The name of a training. This will be used to create a directory. Can contain subdirectory
        estimator_type (str): one of these values (tensorflow, sklearn, pytorch), or None for the generic Estimator.
        input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name
        input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name
        compute_target (str): The compute target (default = 'local') on which the training should be executed
        gpu_compute (bool): Indicates if GPU compute is required for this script or not
        script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script
        show_widget (bool): Will display the live tracking of the submitted Run (not used in the visible part of this method)
        **kwargs: Accepted but not used in the visible part of this method.
    Raises:
        FileNotFoundError: If training_name is not an existing directory.
        ValueError: If estimator_type is not one of the supported values.
    '''
    # Check if directory exists
    if not os.path.isdir(training_name):
        raise FileNotFoundError(training_name)

    # Reject unknown estimator types up front; previously they fell through
    # every branch and failed at submit time with an unbound `estimator`.
    supported_types = (None, 'tensorflow', 'sklearn', 'pytorch')
    if estimator_type not in supported_types:
        raise ValueError(
            'Unsupported estimator_type {!r}; expected one of: '
            'None, tensorflow, sklearn, pytorch'.format(estimator_type))

    # Check compute target
    if compute_target != 'local':
        self.__check_compute_target(compute_target, gpu_compute)

    # Add datasets: mounted ones first, then downloaded ones, each placed in a
    # directory named after the dataset.
    datasets = list()
    if input_datasets is not None:
        for ds in input_datasets:
            datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_mount(path_on_compute=ds))
    if input_datasets_to_download is not None:
        for ds in input_datasets_to_download:
            datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_download(path_on_compute=ds))

    constructor_parameters = {
        'source_directory': training_name,
        'script_params': script_parameters,
        'inputs': datasets,
        'compute_target': compute_target,
        'entry_script': 'train.py',
        'pip_requirements_file': 'requirements.txt',
        'use_gpu': gpu_compute,
        'use_docker': True}

    print('Creating estimator of type', estimator_type)

    # Each framework is imported lazily so only the requested one is loaded.
    if estimator_type is None:
        # Using default Estimator
        from azureml.train.estimator import Estimator
        estimator = Estimator(**constructor_parameters)
    elif estimator_type == 'tensorflow':
        from azureml.train.dnn import TensorFlow
        version_par = 'framework_version'
        # NOTE(review): constructor_parameters never contains this key here,
        # so the default always applies - confirm whether **kwargs was meant
        # to be merged into constructor_parameters.
        if version_par not in constructor_parameters:
            print('Defaulting to version 2.0 for TensorFlow')
            constructor_parameters[version_par] = '2.0'
        estimator = TensorFlow(**constructor_parameters)
    elif estimator_type == 'sklearn':
        from azureml.train.sklearn import SKLearn
        estimator = SKLearn(**constructor_parameters)
    else:  # 'pytorch' (validated above)
        from azureml.train.dnn import PyTorch
        estimator = PyTorch(**constructor_parameters)

    # Submit training
    self.__current_run = self.__experiment.submit(estimator)
arguments=["--model_name_or_path", model_name_param, "--max_seq_length", max_seq_len_param], outputs=[prepared_dataset], source_directory=prep_project_folder, compute_target=compute_target, runconfig=run_config, allow_reuse=True, ) estimator = PyTorch( source_directory=train_project_folder, compute_target=compute_target, entry_script=train_script_name, use_gpu=True, pip_packages=[ "azureml-sdk", "nlp==0.2.0", "pytorch-lightning==0.8.0rc4", "transformers==2.11.0", "pandas", "scipy", "scikit-learn", ], framework_version="1.5", ) train_step = EstimatorStep( name="Training Step", estimator=estimator, estimator_entry_script_arguments=[ "--model_name_or_path", model_name_param, "--task",
# Boolean switches are forwarded to the training script as value-less flags.
for flag in ('reset', 'sink'):
    if getattr(args, flag):
        script_params['--' + flag] = ''

# Docker shared-memory size; falls back to 8g when not given.
shared_memory_size = args.shared_memory_size or '8g'

cluster = ComputeTarget(workspace=ws, name=args.cluster_name)
run_config.target = cluster

project_dir = './pytorch'
experiment_name = 'gc_' + name
experiment = Experiment(ws, name=experiment_name)

src = PyTorch(
    source_directory=project_dir,
    script_params=script_params,
    compute_target=cluster,
    entry_script='main.py',
    use_gpu=True,
    shm_size=shared_memory_size,
    pip_packages=['numpy==1.17.0', 'Pillow==6.1.0', 'scipy==1.3.0'],
)
run = experiment.submit(src)

# Only block on the run when the caller asked to see its output.
if args.show_output:
    run.wait_for_completion(args.show_output)
"num_epochs": choice(1, 2), "batch_size": choice(10, 20, 50, 100, 200, 300, 500, 1000), "hidden_size": choice(300, 400) }) # Define Run Configuration estimator = PyTorch( entry_script='train.py', source_directory=os.path.join(os.path.dirname(os.path.realpath(__file__)), '../', 'modeling'), compute_target=compute_target_hyper, distributed_training=MpiConfiguration(), framework_version='1.4', use_gpu=True, pip_packages=[ 'numpy==1.15.4', 'pandas==0.23.4', 'scikit-learn==0.20.1', 'scipy==1.0.0', 'matplotlib==3.0.2', 'utils==0.9.0', 'onnxruntime==1.2.0', 'onnx==1.6.0' ]) # Define the pipeline step hypertuning = HyperDriveStep( name='hypertrain', hyperdrive_config=HyperDriveConfig( estimator=estimator, hyperparameter_sampling=param_sampling, policy=None, primary_metric_name="accuracy",
## Training Step ##
# steps/train.py does the training based on the processed data.

# Output location for the produced model
model = PipelineData(
    name="model",
    datastore=ds,
    output_path_on_compute="model",
)

# Arguments passed to the training script
estimator_script_params = [
    "--data-folder", training_data_location,
    "--output-folder", model,
]

# Create the PyTorch estimator
trainEstimator = PyTorch(
    source_directory=script_folder,
    compute_target=cluster,
    entry_script="steps/train.py",
    use_gpu=True,
    framework_version='1.3',
)

# Create a pipeline step that runs the PyTorch estimator
trainOnGpuStep = EstimatorStep(
    name='Train Estimator Step',
    estimator=trainEstimator,
    estimator_entry_script_arguments=estimator_script_params,
    inputs=[training_data_location],
    outputs=[model],
    compute_target=cluster,
)

## Register Model Step ##
"--exp_name": workdir.split('/')[-1], } def make_container_registry(address, username, password): cr = ContainerRegistry() cr.address = address cr.username = username cr.password = password return cr estimator = PyTorch(source_directory='./', script_params=script_params, compute_target=ct, use_gpu=True, shm_size='256G', # image_registry_details= my_registry, entry_script=entry_script, custom_docker_image=custom_docker_image, user_managed=True, ) if myargs.itp > 0: cmk8sconfig = K8sComputeConfiguration() cmk8s = dict() cmk8s['gpu_count'] = myargs.card cmk8sconfig.configuration = cmk8s estimator.run_config.cmk8scompute = cmk8sconfig
# define script parameters script_params_3 = { '--models': models, '--data_folder_train': dataset_train.as_named_input('train').as_mount(), '--data_folder_test': dataset_test.as_named_input('test').as_mount(), '--local': 'no' } estimator = PyTorch( entry_script='train.py', script_params=script_params_3, source_directory=os.path.dirname(os.path.realpath(__file__)), compute_target=workspace.compute_targets["alwaysoncluster"], distributed_training=MpiConfiguration(), framework_version='1.4', use_gpu=True, pip_packages=[ 'numpy==1.15.4', 'pandas==0.23.4', 'scikit-learn==0.20.1', 'scipy==1.0.0', 'matplotlib==3.0.2', 'utils==0.9.0', 'onnxruntime==1.2.0', 'onnx==1.6.0' ]) experiment = Experiment(workspace=workspace, name="deeplearning") run = experiment.submit(estimator) if hyperdrive is True: # Define multi-run configuration hyperdrive_run_config = HyperDriveConfig( estimator=estimator, hyperparameter_sampling=param_sampling,
compute_target, compute_target_created = get_compute_target(
    workspace, "lowpriority")

# Everything below runs under try/finally so that a compute target created
# for this run is removed even when submission or the run itself fails.
try:
    dataset = Dataset.get_by_name(workspace=workspace, name=args.dataset_name)
    data_directory = dataset.as_mount()

    experiment = Experiment(workspace, name=args.experiment_name)

    # Arguments forwarded to train.py.
    script_params = {
        "--action": "final_layer",
        "--epochs": args.epochs,
        "--learning-rate": args.learning_rate,
        "--gamma": args.gamma,
        "--momentum": args.momentum,
        "--step-size": args.step_size,
        "--environment": "azure",
        "--model-dir": "./outputs",
        "--data-dir": data_directory,
    }

    estimator = PyTorch(
        source_directory="hymenoptera",
        script_params=script_params,
        compute_target=compute_target,
        entry_script="train.py",
        use_gpu=True,
        pip_packages=["azureml-dataprep[pandas,fuse]", "azureml-mlflow"],
    )

    run = experiment.submit(estimator)
    run.wait_for_completion(show_output=True)
finally:
    # Only delete the target if this script created it.
    if compute_target_created:
        print("Deleting compute target")
        compute_target.delete()
ws = Workspace.from_config()
print("Workspace details:")
# NOTE(review): ws.location is printed twice; one of them was probably meant
# to be another workspace attribute - confirm before changing the output.
print(ws.name, ws.location, ws.resource_group, ws.location, sep='\t')

script_folder = os.getcwd()

# Look up the compute target for the PyTorch experiment. Fail here with a
# clear error when it is missing, instead of hitting a NameError on
# `compute_target` further down.
compute_name = "gpu-nc6-1"
compute_target = ws.compute_targets.get(compute_name)
if compute_target is None:
    raise RuntimeError("compute target not found: " + compute_name)
if isinstance(compute_target, AmlCompute):
    print('Found compute target: ' + compute_name)

# Create experiment
experiment_name = 'my_experiment'
exp = Experiment(workspace=ws, name=experiment_name)

script_params = {}

pt_est = PyTorch(source_directory=script_folder,
                 script_params=script_params,
                 compute_target=compute_target,
                 entry_script='pytorch_net.py',
                 use_gpu=True)

# Submit PyTorch experiment
run = exp.submit(pt_est)
run.wait_for_completion(show_output=True)
experiment_name = "pytorch-mnist"
exp = Experiment(workspace=ws, name=experiment_name)

compute = get_or_create_compute(ws)
prepared_data = MNISTPrepareData(root="./data")

ds = ws.get_default_datastore()
# One-off upload of the prepared data (disabled):
# ds.upload(src_dir=prepared_data.processed_folder, target_path='mnist_pytorch', overwrite=True, show_progress=False)

# The default datastore is mounted and handed to train.py as its data folder.
script_params = {'--data-folder': ds.as_mount()}
script_folder = './scripts'

est = PyTorch(
    source_directory=script_folder,
    script_params=script_params,
    compute_target=compute,
    entry_script='train.py',
)
run = exp.submit(config=est)

# Poll once a minute until the run completes, fails or is cancelled.
while True:
    status = run.get_status()
    if status == "Completed":
        break
    if status in ["Failed", "Canceled"]:
        print('Run failed or cancelled')
        break
    print('Still running, Sleeping for a min and checking again...')
    time.sleep(60)

print(run.get_metrics())
def main(req: func.HttpRequest) -> (func.HttpResponse):
    """HTTP-triggered entry point that submits a PyTorch training run.

    Authenticates with a service principal taken from environment variables,
    opens the workspace (creating it if opening fails for a reason other than
    authentication), reuses or creates the ``gpuforpytorch`` cluster,
    registers the training-data and model blob datastores, and submits
    ``pytorch_train.py`` to the ``fish-no-fish`` experiment.

    Args:
        req: The incoming request; its ``start`` query parameter is read and
            only its type is logged.

    Returns:
        A JSON-encoded string: ``"ProjectSystemException"`` when opening the
        workspace raises that exception, otherwise the status of the
        submitted run. The function does not wait for the run to finish.
    """
    logging.info('Python HTTP trigger function processed a request.')

    # For now this can be a POST where we have <base url>/api/HttpTrigger?start=<any string>
    image_url = req.params.get('start')
    logging.info(type(image_url))

    # Build the service principal credentials before the try block so that
    # `svc_pr` is always bound when the fallback below needs it.
    svc_pr = ServicePrincipalAuthentication(
        tenant_id=os.getenv('TENANT_ID', ''),
        service_principal_id=os.getenv('APP_ID', ''),
        service_principal_password=os.getenv('PRINCIPAL_PASSWORD', ''))

    try:
        ws = Workspace(subscription_id=os.getenv('AZURE_SUB', ''),
                       resource_group=os.getenv('RESOURCE_GROUP', ''),
                       workspace_name=os.getenv('WORKSPACE_NAME', ''),
                       auth=svc_pr)
        print("Found workspace {} at location {} using service principal "
              "authentication".format(ws.name, ws.location))
    # Usually because authentication didn't work
    except ProjectSystemException:
        print('Authentication did not work.')
        return json.dumps('ProjectSystemException')
    # Need to create the workspace
    except Exception:
        ws = Workspace.create(name=os.getenv('WORKSPACE_NAME', ''),
                              subscription_id=os.getenv('AZURE_SUB', ''),
                              resource_group=os.getenv('RESOURCE_GROUP', ''),
                              create_resource_group=True,
                              location='westus',  # Or other supported Azure region
                              auth=svc_pr)
        print("Created workspace {} at location {}".format(ws.name, ws.location))

    # choose a name for your cluster - under 16 characters
    cluster_name = "gpuforpytorch"

    try:
        compute_target = ComputeTarget(workspace=ws, name=cluster_name)
        print('Found existing compute target.')
    except ComputeTargetException:
        print('Creating a new compute target...')
        # AML Compute config: the cluster scales between min_nodes and max_nodes
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',
                                                               min_nodes=0,
                                                               max_nodes=2)
        # create the cluster
        compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
        compute_target.wait_for_completion(show_output=True)

    # use get_status() to get a detailed status for the current cluster.
    # print(compute_target.get_status().serialize())

    # The project directory is expected to already contain the training
    # script; it is not created or populated here (see the note on the
    # Function host further down).
    project_folder = os.path.join(os.getcwd(), 'HttpTrigger', 'project')
    # os.makedirs(project_folder, exist_ok=True)
    # shutil.copy(os.path.join(os.getcwd(), 'HttpTrigger', 'pytorch_train.py'), project_folder)

    # Create an experiment
    experiment_name = 'fish-no-fish'
    experiment = Experiment(ws, name=experiment_name)

    # Use an AML Data Store for training data
    ds = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name='funcdefaultdatastore',
        container_name=os.getenv('STORAGE_CONTAINER_NAME_TRAINDATA', ''),
        account_name=os.getenv('STORAGE_ACCOUNT_NAME', ''),
        account_key=os.getenv('STORAGE_ACCOUNT_KEY', ''),
        create_if_not_exists=True)

    # Use an AML Data Store to save models back up to
    ds_models = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name='modelsdatastorage',
        container_name=os.getenv('STORAGE_CONTAINER_NAME_MODELS', ''),
        account_name=os.getenv('STORAGE_ACCOUNT_NAME', ''),
        account_key=os.getenv('STORAGE_ACCOUNT_KEY', ''),
        create_if_not_exists=True)

    # Set up for training ("trans" flag means - use transfer learning and
    # this should download a model on compute)
    # Using /tmp to store model and info due to the fact that
    # creating new folders and files on the Azure Function host
    # will trigger the function to restart.
    script_params = {
        '--data_dir': ds.as_mount(),
        '--num_epochs': 30,
        '--learning_rate': 0.01,
        '--output_dir': '/tmp/outputs',
        '--trans': 'True'
    }

    # Instantiate PyTorch estimator with upload of final model to
    # a specified blob storage container (this can be anything)
    estimator = PyTorch(source_directory=project_folder,
                        script_params=script_params,
                        compute_target=compute_target,
                        entry_script='pytorch_train.py',
                        use_gpu=True,
                        inputs=[ds_models.as_upload(path_on_compute='./outputs/model_finetuned.pth')])

    run = experiment.submit(estimator)
    print(run.get_details())

    # # The following would certainly be blocking, but that's ok for debugging
    # while run.get_status() not in ['Completed', 'Failed']:  # For example purposes only, not exhaustive
    #     print('Run {} not in terminal state'.format(run.id))
    #     time.sleep(10)

    return json.dumps(run.get_status())
def main(req: func.HttpRequest) -> (func.HttpResponse):
    """HTTP-triggered entry point that trains the model and waits for the run.

    Writes a ``config.json`` built from environment variables, opens the
    workspace from it (creating the workspace if opening fails for a reason
    other than ``ProjectSystemException``), reuses or creates the
    ``gpuclusterplease`` cluster, stages ``pytorch_train.py`` in a project
    folder, registers the training-data and model blob datastores, then
    submits the run to the ``fish-no-fish`` experiment and blocks until it
    completes.

    Args:
        req: The incoming request; its ``start`` query parameter is read and
            only its type is logged.

    Returns:
        A JSON-encoded string: ``"ProjectSystemException"`` when opening the
        workspace raises that exception, otherwise ``"Job complete"``.
    """
    logging.info('Python HTTP trigger function processed a request.')

    # For now this can be a POST where we have <base url>/api/HttpTrigger?start=<any string>
    image_url = req.params.get('start')
    logging.info(type(image_url))

    # Settings shared by config.json and the workspace-creation fallback.
    subscription = os.getenv('AZURE_SUB', '')
    group = os.getenv('RESOURCE_GROUP', '')
    workspace_name = os.getenv('WORKSPACE_NAME', '')

    function_dir = os.path.join(os.getcwd(), 'HttpTrigger')
    config_path = os.path.join(function_dir, 'config.json')

    # Write a config.json (fill in template values with system vars)
    workspace_config = {
        'subscription_id': subscription,
        'resource_group': group,
        'workspace_name': workspace_name
    }
    with open(config_path, 'w') as config_file:
        json.dump(workspace_config, config_file)

    # Get the workspace from config.json
    try:
        ws = Workspace.from_config(config_path)
    except ProjectSystemException:
        # Authentication didn't work
        return json.dumps('ProjectSystemException')
    except Exception:
        # Need to create the workspace
        ws = Workspace.create(
            name=workspace_name,
            subscription_id=subscription,
            resource_group=group,
            create_resource_group=True,
            location='eastus2'  # Or other supported Azure region
        )

    # choose a name for your cluster
    cluster_name = "gpuclusterplease"
    try:
        gpu_cluster = ComputeTarget(workspace=ws, name=cluster_name)
        print('Found existing compute target.')
    except ComputeTargetException:
        print('Creating a new compute target...')
        # AML Compute config: the cluster scales up to max_nodes
        provisioning = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6', max_nodes=4)
        # create the cluster
        gpu_cluster = ComputeTarget.create(ws, cluster_name, provisioning)
        gpu_cluster.wait_for_completion(show_output=True)

    # use get_status() to get a detailed status for the current cluster.
    print(gpu_cluster.get_status().serialize())

    # Create a project directory and copy the training script to it
    project_folder = os.path.join(function_dir, 'project')
    os.makedirs(project_folder, exist_ok=True)
    shutil.copy(os.path.join(function_dir, 'pytorch_train.py'), project_folder)

    # Create an experiment
    experiment = Experiment(ws, name='fish-no-fish')

    storage_account = os.getenv('STORAGE_ACCOUNT_NAME', '')

    # Use an AML Data Store for training data
    training_store = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name='funcdefaultdatastore',
        container_name=os.getenv('STORAGE_CONTAINER_NAME_TRAINDATA', ''),
        account_name=storage_account,
        account_key=os.getenv('STORAGE_ACCOUNT_KEY', ''),
        create_if_not_exists=True)

    # Use an AML Data Store to save models back up to
    model_store = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name='modelsdatastorage',
        container_name=os.getenv('STORAGE_CONTAINER_NAME_MODELS', ''),
        account_name=storage_account,
        account_key=os.getenv('STORAGE_ACCOUNT_KEY', ''),
        create_if_not_exists=True)

    # Set up for training ("trans" flag means - use transfer learning and
    # this should download a model on compute)
    training_args = {
        '--data_dir': training_store.as_mount(),
        '--num_epochs': 30,
        '--learning_rate': 0.01,
        '--output_dir': './outputs',
        '--trans': 'True'
    }

    # Instantiate PyTorch estimator with upload of final model to
    # a specified blob storage container (this can be anything)
    model_upload = model_store.as_upload(
        path_on_compute='./outputs/model_finetuned.pth')
    estimator = PyTorch(
        source_directory=project_folder,
        script_params=training_args,
        compute_target=gpu_cluster,
        entry_script='pytorch_train.py',
        use_gpu=True,
        inputs=[model_upload])

    training_run = experiment.submit(estimator)
    training_run.wait_for_completion(show_output=True)

    return json.dumps('Job complete')
experiment = Experiment(ws, name=experiment_name)

# Create a PyTorch estimator
# The Azure ML SDK's PyTorch estimator lets you submit PyTorch training jobs
# for both single-node and distributed runs; see the Azure ML documentation
# for details on the estimator. The code below defines a single-node PyTorch
# job.
from azureml.train.dnn import PyTorch

script_params = {
    '--num_epochs': 30,
    '--output_dir': './outputs',
}

estimator = PyTorch(
    source_directory=project_folder,
    script_params=script_params,
    compute_target=compute_target,
    entry_script='pytorch_train.py',
    use_gpu=True,
)

# Having seen a simple PyTorch training run with the SDK, the next step is to
# try to improve the model's accuracy by optimizing its hyperparameters with
# Azure Machine Learning's hyperparameter tuning capabilities.

# Start a hyperparameter sweep
# First, define the hyperparameter space to sweep over. The training script
# uses a learning rate schedule that decays the learning rate every several
# epochs, so the initial learning rate and the momentum are tuned. This
# example uses random sampling to try different hyperparameter sets and
# maximize the primary metric, the best validation accuracy (best_val_acc).

# Then, specify the early termination policy for poorly performing runs.
# The BanditPolicy terminates any run that does not fall within the slack
# factor of the primary evaluation metric. In this tutorial the policy is
# applied every epoch (best_val_acc is reported every epoch and
# evaluation_interval=1), and the first policy evaluation is delayed until
# after the first 10 epochs (delay_evaluation=10). See the Azure ML
# documentation for more on the BanditPolicy and the other available policies.
from azureml.train.hyperdrive import RandomParameterSampling, HyperDriveRunConfig, BanditPolicy, PrimaryMetricGoal, uniform param_sampling = RandomParameterSampling( { 'learning_rate': uniform(0.0005, 0.005), 'momentum': uniform(0.9, 0.99)
if args.do_train: logging.warning(f'[INFO] Running train for {args.project_name}') for task in tasks: exp = Experiment(workspace=ws, name=f'{args.project_name}_train_{task}') config = tasks.get(task) if config.get('type') == 'classification': script_params = { '--task': int(task), '--use_cuda': '', '--register_model': '' } est = PyTorch(source_directory=script_folder, compute_target=compute_target, script_params=script_params, entry_script='src/classification.py', pip_packages=pip_packages, use_gpu=True) ### Hyperparameters params if language == 'en': model_type = choice('roberta', 'bert', 'albert') elif language == 'de': model_type = choice('distilbert', 'bert', 'roberta') elif language == 'it' or language == 'es': model_type = choice('bert') elif language == 'fr': model_type = choice('camembert', 'bert') param_sampling = RandomParameterSampling({ '--n_epochs': choice(3, 5, 10),
enable_optimized_mode = experiment_settings["framework"]["pytorch"][ "_enable_optimized_mode"] estimator = PyTorch( source_directory=experiment_settings["source_directory"], compute_target=compute_target, entry_script=experiment_settings["entry_script"], script_params=experiment_settings["script_parameters"], node_count=experiment_settings["distributed_training"]["node_count"], distributed_training=distrib_training_backend, use_docker=experiment_settings["docker"]["use_docker"], custom_docker_image=experiment_settings["docker"]["custom_image"], image_registry_details=container_registry, user_managed=experiment_settings["user_managed"], conda_packages=experiment_settings["dependencies"]["conda_packages"], pip_packages=experiment_settings["dependencies"]["pip_packages"], conda_dependencies_file=experiment_settings["dependencies"] ["conda_dependencies_file"], pip_requirements_file=experiment_settings["dependencies"] ["pip_requirements_file"], environment_variables=experiment_settings["environment_variables"], inputs=experiment_settings["data_references"], source_directory_data_store=experiment_settings[ "source_directory_datastore"], shm_size=experiment_settings["docker"]["shm_size"], max_run_duration_seconds=experiment_settings[ "max_run_duration_seconds"], framework_version=framework_version, _enable_optimized_mode=enable_optimized_mode) elif experiment_settings["framework"]["name"] == "tensorflow": framework_version = experiment_settings["framework"]["tensorflow"][
# Pipeline step 1: run the preprocessing estimator on the CPU cluster.
# It consumes `input_data` and produces `output`; both are also forwarded to
# the entry script as --data_dir / --output_data_dir.
preprocessing_step = EstimatorStep(
    name="Preprocessing_Train",
    estimator=preprocessing_est,
    compute_target=cpu_cluster,
    inputs=[input_data],
    outputs=[output],
    estimator_entry_script_arguments=[
        "--data_dir", input_data,
        "--output_data_dir", output,
    ],
    allow_reuse=True,
)

# Estimator for the training script: CPU only, pinned framework version,
# with pandas added to the Conda environment.
pytorch_est = PyTorch(
    entry_script='pytorch_train.py',
    source_directory='020-ann',
    compute_target=cpu_cluster,
    framework_version='1.1',
    conda_packages=['pandas'],
    use_gpu=False,
)

# Pipeline step 2: train on the output of the preprocessing step.
pytorch_step = EstimatorStep(
    name="PyTorch_Train",
    estimator=pytorch_est,
    compute_target=cpu_cluster,
    inputs=[output],
    estimator_entry_script_arguments=["--data_dir", output],
    allow_reuse=True,
)

# Assemble both steps into one pipeline and submit it under the experiment
# named on the command line.
pipeline = Pipeline(
    workspace=ws,
    steps=[preprocessing_step, pytorch_step],
)
run = Experiment(ws, args.experiment).submit(pipeline)
def create_estimator_from_configs(
        azure_config: AzureConfig,
        source_config: SourceConfig,
        estimator_inputs: List[DatasetConsumptionConfig]) -> PyTorch:
    """
    Build the PyTorch estimator for an AzureML job from the given configuration objects.

    :param azure_config: Azure settings for the job. Read here: the compute cluster, the workspace,
        the Docker shared memory size, the optional maximum run duration, the optional pip
        extra-index-url and the hyperdrive flag.
    :param source_config: Source settings for the job. Read here: root folder, entry script, script
        parameters, Conda dependency files, extra environment variables and the upload timeout.
    :param estimator_inputs: Passed through unchanged as the "inputs" argument of the estimator.
    :return: The configured estimator. When azure_config.hyperdrive is set, the result of calling
        source_config.hyperdrive_config_func on that estimator is returned instead.
    """
    # AzureML sometimes appears to expect the entry script in Linux format, so express it as a
    # posix path relative to the source root.
    script_path = Path(source_config.entry_script)
    posix_entry_script = script_path.relative_to(source_config.root_folder).as_posix()
    logging.info(
        f"Entry script {posix_entry_script} ({source_config.entry_script} relative to "
        f"source directory {source_config.root_folder})")

    # Defaults first; entries supplied by the source config override them on key clashes.
    env_vars = {
        "AZUREML_OUTPUT_UPLOAD_TIMEOUT_SEC": str(source_config.upload_timeout_seconds),
        "MKL_SERVICE_FORCE_INTEL": "1",
    }
    env_vars.update(source_config.environment_variables or {})

    # Merge the project-specific dependencies with the packages that InnerEye itself needs. This
    # would not be needed if the innereye package were installed, but it is needed when InnerEye is
    # a git submodule of an outer project and jobs are submitted from the local machine.
    # On version conflicts, the package version of the outer project wins.
    merged_dependencies = merge_conda_dependencies(
        source_config.conda_dependencies_files)  # type: ignore
    extra_index_url = azure_config.pip_extra_index_url
    if extra_index_url:
        # Swap the search order: the supplied index becomes the primary one and PyPI the extra one,
        # so that packages on the extra index win over same-named packages on PyPI.
        merged_dependencies.set_pip_option(f"--index-url {extra_index_url}")
        merged_dependencies.set_pip_option("--extra-index-url https://pypi.org/simple")

    # The estimator's framework version is derived from the merged Conda environment.
    torch_version = pytorch_version_from_conda_dependencies(merged_dependencies)
    logging.info(f"PyTorch framework version: {torch_version}")

    duration_limit = (
        run_duration_string_to_seconds(azure_config.max_run_duration)
        if azure_config.max_run_duration else None)

    # Use blob storage for storing the source, rather than the FileShares section of the storage
    # account.
    source_datastore = azure_config.get_workspace().datastores.get(
        WORKSPACE_DEFAULT_BLOB_STORE_NAME)

    estimator = PyTorch(
        source_directory=source_config.root_folder,
        entry_script=posix_entry_script,
        script_params=source_config.script_params,
        compute_target=azure_config.cluster,
        source_directory_data_store=source_datastore,
        inputs=estimator_inputs,
        environment_variables=env_vars,
        shm_size=azure_config.docker_shm_size,
        use_docker=True,
        use_gpu=True,
        framework_version=torch_version,
        max_run_duration_seconds=duration_limit)
    estimator.run_config.environment.python.conda_dependencies = merged_dependencies
    # We'd like to log the estimator config, but conversion to string fails when the Estimator has
    # some inputs.
    # logging.info(azure_util.estimator_to_string(estimator))
    if not azure_config.hyperdrive:
        return estimator
    return source_config.hyperdrive_config_func(estimator)  # type: ignore
except ComputeTargetException: print('Creating a new compute target...') compute_config = AmlCompute.provisioning_configuration( vm_size='STANDARD_NC6', max_nodes=1) compute_target_gpu = ComputeTarget.create(ws, cluster_name, compute_config) compute_target_gpu.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=0) script_params = {} estimator = PyTorch(source_directory='./', script_params=script_params, compute_target=compute_target_gpu, entry_script='train.py', use_gpu=True, pip_packages=[], framework_version='1.2') est_step = EstimatorStep(name="Train_Step", estimator=estimator, estimator_entry_script_arguments=auth_params, runconfig_pipeline_params=None, inputs=[], outputs=[], compute_target=compute_target_gpu) est_step.run_after(process_step) # step 3
inputs=[input_dir], outputs=[processed_dir], compute_target=cluster_name, runconfig=run_config, source_directory=PREPROCESS_DIR ) #%% [markdown] # ## Pipeline second step: training # # For the second step, we start by defining the PyTorch estimator that will be used to train the stochastic variational deep kernel learning model using GPyTorch. #%% estimator = PyTorch(source_directory=TRAIN_DIR, conda_packages=['pandas', 'numpy', 'scikit-learn'], pip_packages=['gpytorch'], compute_target=cluster, entry_script='svdkl_entry.py', use_gpu=True) #%% [markdown] # Here, we configure Hyperdrive by defining the hyperparameter space and choosing area under the curve as the metric to optimize. #%% ps = RandomParameterSampling({ '--batch-size': choice(4096, 8192), '--epochs': choice(500), '--neural-net-lr': loguniform(-4, -2), '--likelihood-lr': loguniform(-4, -2), '--grid-size': choice(32, 64), '--grid-bounds': choice(-1, 0), '--latent-dim': choice(2),
'--momentum': 0.9, '--num-dataload-workers': 6, '--epochs-before-unfreeze-all': '0', # Don't unfreeze the model - since the performance degrades based on the number of images we have in the test set } conda_packages = ['pytorch', 'scikit-learn'] pip_packages = ['pydocumentdb', 'torchvision'] #%% estimator = PyTorch(source_directory='./aml-image-models', compute_target=ct, entry_script='train_network.py', script_params=script_params, node_count=1, process_count_per_node=1, conda_packages=conda_packages, pip_packages=pip_packages, use_gpu=True) #%% # Create Experiment object - this will be used to submit the Hyperdrive run and store all the given parameters experiment_hd = Experiment(workspace=ws, name='hyperdrive') #%% [markdown] ###### Create Random Parameter Sampler #%%