コード例 #1
0
def submit_for_inference(args: SubmitForInferenceConfig,
                         azure_config: AzureConfig) -> Optional[Path]:
    """
    Submit an inference run for a registered model to AzureML and, when a download folder is given,
    wait for the run and fetch the segmentation it produced.
    :param args: Submission settings, see SubmitForInferenceConfig. Read here: settings, model_id, image_file,
    experiment_name, keep_upload_folder and download_folder.
    :param azure_config: An object with all necessary information for accessing Azure.
    :return: None when args.download_folder is None. Otherwise the chosen download path, which is returned
    even if the file is not present after the download attempt (a warning is logged in that case).
    :raises ValueError: If no Conda environment file could be downloaded from the model.
    """
    logging.info(f"Building Azure configuration from {args.settings}")
    logging.info("Getting workspace")
    aml_workspace = azure_config.get_workspace()
    logging.info("Identifying model")
    registered_model = Model(workspace=aml_workspace, id=args.model_id)
    model_id = registered_model.id
    logging.info(f"Identified model {model_id}")

    # Everything that is submitted with the run is assembled inside a temporary directory.
    submission_dir = tempfile.TemporaryDirectory()
    submission_root = Path(submission_dir.name)
    logging.info(
        f"Building inference run submission in {submission_root}")
    data_folder = submission_root / fixed_paths.DEFAULT_DATA_FOLDER
    image = copy_image_file(args.image_file, data_folder)
    sas_urls = registered_model.get_sas_urls()

    # Download every file named "environment.yml" that the model contains. They go into a dedicated temp
    # folder that will most likely not exist in the model itself: the AzureML run later downloads the model
    # into the same folder structure, and the file names might otherwise clash.
    scoring_temp_folder = submission_root / "temp_for_scoring"
    conda_files = download_files_from_model(sas_urls,
                                            ENVIRONMENT_YAML_FILE_NAME,
                                            dir_path=scoring_temp_folder)
    if not conda_files:
        raise ValueError(
            "At least 1 Conda environment definition must exist in the model.")

    # Copy the scoring script from the repository next to the data. It is the run's entry point: it starts
    # the model download from Azure and invokes the scoring script.
    script_in_repo = fixed_paths.repository_root_directory(
        fixed_paths.RUN_SCORING_SCRIPT)
    entry_script = submission_root / Path(
        fixed_paths.RUN_SCORING_SCRIPT).name
    shutil.copyfile(str(script_in_repo), str(entry_script))

    script_params = {
        "--model-folder": ".",
        "--model-id": model_id,
        fixed_paths.SCORE_SCRIPT: "",
        # The data folder must be relative to the root folder of the AzureML job; the image file is then
        # given relative to that data folder.
        "--data_folder": image.parent.name,
        "--image_files": image.name
    }
    source_config = SourceConfig(
        root_folder=submission_root,
        entry_script=entry_script,
        script_params=script_params,
        conda_dependencies_files=conda_files,
    )
    estimator = create_estimator_from_configs(azure_config, source_config, [])
    experiment = Experiment(workspace=aml_workspace, name=args.experiment_name)
    run = experiment.submit(estimator)
    logging.info(f"Submitted run {run.id} in experiment {run.experiment.name}")
    logging.info(f"Run URL: {run.get_portal_url()}")

    if not args.keep_upload_folder:
        submission_dir.cleanup()
        logging.info(f"Deleted submission directory {submission_root}")

    # Without a download folder the caller only wants the submission, not the result.
    if args.download_folder is None:
        return None

    logging.info("Awaiting run completion")
    run.wait_for_completion()
    logging.info(f"Run has completed with status {run.get_status()}")
    download_path = choose_download_path(args.download_folder)
    logging.info(f"Attempting to download segmentation to {download_path}")
    run.download_file(DEFAULT_RESULT_IMAGE_NAME, str(download_path))
    if not download_path.exists():
        logging.warning("Segmentation NOT downloaded")
    else:
        logging.info(f"Downloaded segmentation to {download_path}")
    return download_path
コード例 #2
0
# Build the pipeline from the three steps. `ws`, `preprocessing_step`, `est_step`
# and `register_step` are defined earlier in the script (outside this excerpt).
pipeline = Pipeline(workspace=ws,
                    steps=[preprocessing_step, est_step, register_step])

# Validate the pipeline definition before publishing/submitting it.
pipeline.validate()
print("Pipeline validation complete")

# Publish the pipeline. The returned object is kept in `published_pipeline`
# but is not used again in this excerpt.
published_pipeline = pipeline.publish(
    name="MLOps_Pipeline_Estimator",
    description="MLOps pipeline for estimator",
    continue_on_step_failure=True)

# Submit the pipeline to the experiment `exp` with an empty parameter set.
pipeline_run = exp.submit(pipeline, pipeline_parameters={})
print("Pipeline is submitted for execution")

#######################################################################################################
# Block until the run finishes, showing its output on stdout.
pipeline_run.wait_for_completion(show_output=True)

# Raise an exception if the run failed; the message carries the run status
# and the detailed logs.
# NOTE(review): only the "Failed" status is treated as an error here; confirm
# whether other unsuccessful states (e.g. a cancelled run) should raise too.
# NOTE(review): the message says "Training on local" although this is a
# pipeline run; the text is left as-is because it is runtime output.
if pipeline_run.get_status() == "Failed":
    raise Exception(
        "Training on local failed with following run status: {} and logs: \n {}"
        .format(pipeline_run.get_status(),
                pipeline_run.get_details_with_logs()))

# Writing the run id to /aml_config/run_id.json
'''
コード例 #3
0
def launch_experiment(ws, conf_aml, conf_cluster, conf_docker,
                      conf_experiment, num_runs=200):
    """Register the datastores, get or create the cluster, and submit a batch of runs.

    Args:
        ws: AzureML workspace the datastores, cluster and experiment belong to.
        conf_aml: Mapping; 'azure_storage_account_key' is read from it.
        conf_cluster: Mapping; 'cluster_name' is always read, and 'vm_size',
            'max_nodes', 'vm_priority' and 'idle_seconds_before_scaledown'
            are read when the cluster has to be created.
        conf_docker: Mapping; 'image_registry_address',
            'image_registry_username', 'image_registry_password' and
            'image_name' are read from it.
        conf_experiment: Mapping; 'experiment_name' is read from it.
        num_runs: Number of runs to submit. Defaults to 200, the value that
            used to be hard-coded.
    """
    # Register the input data blob container
    input_ds = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name='petridishdata',
        container_name='datasets',
        account_name='petridishdata',
        account_key=conf_aml['azure_storage_account_key'],
        create_if_not_exists=False)

    # Register the output blob container (same storage account, other container)
    output_ds = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name='petridishoutput',
        container_name='amloutput',
        account_name='petridishdata',
        account_key=conf_aml['azure_storage_account_key'],
        create_if_not_exists=False)

    # Create or attach compute cluster
    cluster_name = conf_cluster['cluster_name']

    try:
        compute_target = ComputeTarget(workspace=ws, name=cluster_name)
        print('Found existing compute target.')
    except Exception:
        # A failed lookup is taken to mean the cluster does not exist yet.
        # `Exception` (not a bare `except`) so that Ctrl+C / SystemExit are
        # not mistaken for a missing cluster and do not trigger provisioning.
        print('Creating a new compute target...')
        compute_config = AmlCompute.provisioning_configuration(
            vm_size=conf_cluster['vm_size'],
            max_nodes=conf_cluster['max_nodes'],
            vm_priority=conf_cluster['vm_priority'],
            idle_seconds_before_scaledown=conf_cluster[
                'idle_seconds_before_scaledown'])

        # Create the cluster
        compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
        compute_target.wait_for_completion(show_output=True)

    # use get_status() to get a detailed status for the current cluster.
    print(compute_target.get_status().serialize())

    # Set project directory: the parent of the current working directory is
    # submitted as the source directory.
    project_folder = '../'

    # Setup custom docker usage
    image_registry_details = ContainerRegistry()
    image_registry_details.address = conf_docker['image_registry_address']
    image_registry_details.username = conf_docker['image_registry_username']
    image_registry_details.password = conf_docker['image_registry_password']

    # don't let the system build a new conda environment
    user_managed_dependencies = True

    # Note that experiment names have to be
    # <36 alphanumeric characters
    exp_name = conf_experiment['experiment_name']
    experiment = Experiment(ws, name=exp_name)

    for i in tqdm(range(num_runs)):
        # Each run writes its logs to its own folder on the output datastore.
        log_dir = f'{exp_name}_{i}'
        script_params = {
            '--nas.eval.loader.dataset.dataroot':
            input_ds.path('/').as_mount(),
            '--common.logdir':
            output_ds.path(f'/{log_dir}').as_mount(),
        }

        est = Estimator(source_directory=project_folder,
                        script_params=script_params,
                        compute_target=compute_target,
                        entry_script='scripts/random/cifar_eval.py',
                        custom_docker_image=conf_docker['image_name'],
                        image_registry_details=image_registry_details,
                        user_managed=user_managed_dependencies,
                        source_directory_data_store=input_ds)

        # Fire and forget: the returned run object is not needed here.
        experiment.submit(est)
コード例 #4
0
ファイル: 10-TrainOnLocal.py プロジェクト: hzjai0624/AIDevOps
import json
import os

from azureml.core import Workspace
from azureml.core import Experiment
from azureml.core import ScriptRunConfig
# RunConfiguration is used below but was never imported (NameError at runtime).
from azureml.core.runconfig import RunConfiguration

# Load the workspace from the local configuration
ws = Workspace.from_config()

# Set up the experiment
experiment_name = 'devops0201'
exp = Experiment(workspace=ws, name=experiment_name)
print(exp.name, exp.workspace.name, sep='\n')

# Run configuration: the Python environment is managed by the user,
# AzureML does not build one.
run_config_user_managed = RunConfiguration()
run_config_user_managed.environment.python.user_managed_dependencies = True

# Point the run at the model training script
src = ScriptRunConfig(source_directory='./code',
                      script='training/train.py',
                      run_config=run_config_user_managed)
print("モデル学習の実施")
run = exp.submit(src)

# Stream the run output until it completes
run.wait_for_completion(show_output=True)

# Persist the run id and experiment name to aml_config/run_id.json.
run_id = {
    'run_id': run.id,
    'experiment_name': run.experiment.name,
}
# Make sure the target folder exists; open() would fail otherwise.
os.makedirs('aml_config', exist_ok=True)
with open('aml_config/run_id.json', 'w') as outfile:
    json.dump(run_id, outfile)
コード例 #5
0
ファイル: run_experiment.py プロジェクト: ehrhorn/ds_envs
# Value passed to the training script through "--version".
dataset_version = 1
# Arguments handed to train.py via ScriptRunConfig(arguments=...).
# `ip` is defined earlier in the script (outside this excerpt).
# NOTE(review): the list mixes strings and ints (300, 5678, dataset_version);
# confirm ScriptRunConfig accepts non-string entries.
arguments = [
    "--remote_debug",
    "--remote_debug_connection_timeout",
    300,
    "--remote_debug_client_ip",
    ip,
    "--remote_debug_port",
    5678,
    "--version",
    dataset_version,
]

# Fetch the environment named "ds_envs" from the workspace `ws`
# (`ws` is defined outside this excerpt).
env = Environment.get(workspace=ws, name="ds_envs")

# Run train.py from <project root>/ds_envs/cloud on the "local" compute target.
# `get_project_root` is defined outside this excerpt.
src = ScriptRunConfig(
    source_directory=get_project_root() / "ds_envs" / "cloud",
    script="train.py",
    arguments=arguments,
    compute_target="local",
    environment=env,
)

experiment_name = "my_experiment"
experiment = Experiment(workspace=ws, name=experiment_name)

# Submit the run and stream its output until it completes.
run = experiment.submit(config=src)

run.wait_for_completion(show_output=True)
コード例 #6
0
    source_directory=project_folder,
    allow_reuse=True,
    runconfig=amlcompute_run_config)

print("AutoML Training Step created.")

# The pipeline consists of two steps, `anom_detect` and `automl_step`,
# both created earlier in the script (outside this excerpt).
steps = [anom_detect, automl_step]
print("Step lists created")

pipeline = Pipeline(workspace=ws, steps=steps)
print("Pipeline is built")

pipeline.validate()
print("Pipeline validation complete")

# Submit the pipeline; the trailing comment keeps an optional argument that is currently disabled.
pipeline_run = experiment.submit(pipeline)  #, regenerate_outputs=True)
print("Pipeline is submitted for execution")

# Wait until the run finishes (without echoing its output).
pipeline_run.wait_for_completion(show_output=False)
print("Pipeline run completed")

# Download everything under the 'aml_config' prefix of `def_data_store`
# into the current directory, overwriting existing files.
def_data_store.download(target_path='.',
                        prefix='aml_config',
                        show_progress=True,
                        overwrite=True)

def_data_store.download(target_path='.',
                        prefix='outputs',
                        show_progress=True,
コード例 #7
0
    async def __create_cluster(self):
        """Submit the scheduler run to AzureML and connect this object to it.

        Submits a single-node run of ``start_scheduler.py``, polls the run
        until it reports a "scheduler" metric (or ends as Canceled/Failed),
        stores the scheduler address on the instance, then sets up the RPC
        connection, port forwarding and links, and finally scales to
        ``self.initial_node_count`` workers when that is greater than 1.

        Raises:
            Exception: If the run is in the "Canceled" or "Failed" state once
                polling stops.
        """
        # set up environment
        self.__print_message("Setting up cluster")

        # submit run
        self.__print_message("Submitting the experiment")

        exp = Experiment(self.workspace, self.experiment_name)
        estimator = Estimator(
            os.path.join(self.abs_path, "setup"),
            compute_target=self.compute_target,
            entry_script="start_scheduler.py",
            environment_definition=self.environment_definition,
            script_params=self.scheduler_params,
            node_count=1,  ### start only scheduler
            distributed_training=MpiConfiguration(),
            use_docker=True,
            inputs=self.datastores,
        )

        run = exp.submit(estimator)

        self.__print_message("Waiting for scheduler node's IP")

        # Query the status once per check instead of once per compared state.
        terminal_states = ("Canceled", "Failed")
        while (
            run.get_status() not in terminal_states
            and "scheduler" not in run.get_metrics()
        ):
            print(".", end="")
            logger.info("Scheduler not ready")
            # NOTE(review): time.sleep blocks the event loop inside this
            # coroutine; consider `await asyncio.sleep(...)` after confirming
            # how self.sync drives this method.
            time.sleep(5)

        if run.get_status() in terminal_states:
            # No exception is being handled here, so logger.exception would
            # only append an empty "NoneType: None" traceback.
            logger.error("Failed to start the AzureML cluster")
            raise Exception("Failed to start the AzureML cluster.")

        print("\n\n")

        ### SET FLAGS
        # Fetch the metric once and reuse it for the attributes and messages.
        scheduler_address = run.get_metrics()["scheduler"]
        self.scheduler_ip_port = scheduler_address
        self.worker_params["--scheduler_ip_port"] = scheduler_address
        self.__print_message(f"Scheduler: {scheduler_address}")
        self.run = run

        logger.info(f"Scheduler: {scheduler_address}")

        ### CHECK IF ON THE SAME VNET
        while self.same_vnet is None:
            await self.sync(self.__check_if_scheduler_ip_reachable)
            time.sleep(1)

        ### REQUIRED BY dask.distributed.deploy.cluster.Cluster
        _scheduler = self.__prepare_rpc_connection_to_headnode()
        self.scheduler_comm = rpc(_scheduler)
        await self.sync(self.__setup_port_forwarding)
        await self.sync(super()._start)
        await self.sync(self.__update_links)

        self.__print_message("Connections established")
        self.__print_message(f"Scaling to {self.initial_node_count} workers")

        if self.initial_node_count > 1:
            self.scale(
                self.initial_node_count
            )  # LOGIC TO KEEP PROPER TRACK OF WORKERS IN `scale`
        self.__print_message("Scaling is done")
コード例 #8
0
    '--lr_decay':
    loguniform(-9, -1)
})

# Early-termination policy for the sweep: evaluated every 2 intervals with a
# slack factor of 0.1. The trailing comment keeps an optional argument that is
# currently disabled.
policy = BanditPolicy(evaluation_interval=2,
                      slack_factor=0.1)  #, delay_evaluation=20)

# Sweep configuration: minimise 'val_loss' over the sampling space `ps` using
# the estimator `est` (both defined outside this excerpt).
# NOTE(review): max_concurrent_runs (50) exceeds max_total_runs (5); confirm
# this is intended.
hdc = HyperDriveRunConfig(estimator=est,
                          hyperparameter_sampling=ps,
                          policy=policy,
                          primary_metric_name='val_loss',
                          primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                          max_total_runs=5,
                          max_concurrent_runs=50)

# Submit the sweep to the experiment `exp` and stream its output until done.
hdr = exp.submit(config=hdc)

hdr.wait_for_completion(show_output=True)

# Pick the best child run according to the primary metric.
# `best_run_metrics` is not used in the visible part of the script.
best_run = hdr.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
print(best_run)

# Collect the run id and experiment name for aml_config/run_id.json, for use by a DevOps pipeline.
run_id = {}
run_id['run_id'] = best_run.id
run_id['experiment_name'] = best_run.experiment.name

# Make sure the target folder exists before the run info is saved.
os.makedirs('aml_config', exist_ok=True)
with open('aml_config/run_id.json', 'w') as outfile:
コード例 #9
0
# %%
from azureml.train.dnn import PyTorch

# PyTorch estimator that runs mnist.py from `project_folder` on `compute_target`
# (both defined outside this excerpt), without GPU, passing --output-dir ./outputs.
estimator = PyTorch(source_directory=project_folder,
                    script_params={'--output-dir': './outputs'},
                    compute_target=compute_target,
                    entry_script='mnist.py',
                    use_gpu=False)

# Swap the 'pytorch=0.4.0' conda package for 'pytorch-nightly' and add the
# 'pytorch' channel to the estimator's conda dependencies.
estimator.conda_dependencies.remove_conda_package('pytorch=0.4.0')
estimator.conda_dependencies.add_conda_package('pytorch-nightly')
estimator.conda_dependencies.add_channel('pytorch')

# %%
# Submit the estimator to the experiment `exp` and stream the output until done.
run = exp.submit(estimator)
run.wait_for_completion(show_output=True)

# %%
# List the run's files (the result is only displayed when run as a notebook
# cell), then download outputs/mnist.onnx to the same relative path locally.
run.get_file_names()
model_path = os.path.join('outputs', 'mnist.onnx')
run.download_file(model_path, output_file_path=model_path)

# %%
# Register the ONNX file of the run as model 'mnist' and print its identity.
model = run.register_model(model_name='mnist', model_path=model_path)
print(model.name, model.id, model.version, sep='\t')

# %%
models = ws.models
for name, m in models.items():
    print("Name:", name, "\tVersion:", m.version, "\tDescription:",
コード例 #10
0
def main():
    """
    Run the experiment for training

    Submits ``train_keras.py`` to the "cpu-cluster" compute target with the
    "datasets/mnist" files of the workspace's default datastore mounted as
    input, streams the run to TensorBoard until the user presses enter, and
    registers ``outputs/keras_lenet.h5`` as model "keras_mnist" together with
    the last logged train/validation loss and accuracy values.
    """
    work_space = Workspace.from_config()

    # Set up the dataset for training
    datastore = work_space.get_default_datastore()
    dataset = Dataset.File.from_files(path=(datastore, "datasets/mnist"))

    # Set up the experiment for training
    experiment = Experiment(workspace=work_space, name="keras-lenet-train")
    #     azureml._restclient.snapshots_client.SNAPSHOT_MAX_SIZE_BYTES = 2000000000
    config = ScriptRunConfig(
        source_directory=".",
        script="train_keras.py",
        compute_target="cpu-cluster",
        arguments=[
            "--data_folder",
            dataset.as_named_input("input").as_mount(),
        ],
    )

    # Set up the TensorFlow/Keras environment
    environment = Environment("keras-environment")
    environment.python.conda_dependencies = CondaDependencies.create(
        python_version="3.7.7",
        pip_packages=["azureml-defaults", "numpy", "tensorflow==2.3.1"])
    config.run_config.environment = environment

    # Run the experiment for training
    run = experiment.submit(config)
    aml_url = run.get_portal_url()
    print(
        "Submitted to an Azure Machine Learning compute cluster. Click on the link below"
    )
    print("")
    print(aml_url)

    tboard = Tensorboard([run])
    # If successful, start() returns a string with the URI of the instance.
    tboard.start(start_browser=True)
    try:
        run.wait_for_completion(show_output=True)
        print("Press enter to stop")
        input()
    finally:
        # Always stop the streaming, even when the run fails or the user
        # interrupts; otherwise it keeps running after the job is over.
        tboard.stop()

    # Register Model
    metrics = run.get_metrics()
    run.register_model(
        model_name="keras_mnist",
        tags={
            "data": "mnist",
            "model": "classification"
        },
        model_path="outputs/keras_lenet.h5",
        model_framework=Model.Framework.TENSORFLOW,
        model_framework_version="2.3.1",
        # Each metric is indexed with [-1], i.e. the last logged value is
        # stored as a model property.
        properties={
            "train_loss": metrics["train_loss"][-1],
            "train_accuracy": metrics["train_accuracy"][-1],
            "val_loss": metrics["val_loss"][-1],
            "val_accuracy": metrics["val_accuracy"][-1],
        },
    )
コード例 #11
0
ファイル: run-training.py プロジェクト: benc-uk/batcomputer
    runConfig.environment.python.interpreter_path = os.environ[
        'VIRTUAL_ENV'] + "/bin/python"
    print(
        f"### Will execute script {trainingScriptDir}/{trainingScript} on LOCAL compute"
    )

# Pass two args to the training script: the data path under /tmp and the
# number of estimators. `dataPathRemote`, `estimators`, `trainingScriptDir`,
# `trainingScript`, `runConfig` and `exp` are defined earlier in the script.
scriptArgs = [
    "--data-path", "/tmp/" + dataPathRemote, "--estimators", estimators
]
scriptRunConf = ScriptRunConfig(source_directory=trainingScriptDir,
                                script=trainingScript,
                                arguments=scriptArgs,
                                run_config=runConfig)

run = exp.submit(scriptRunConf)
print(f"### Run '{run.id}' submitted and started...")
run.wait_for_completion(show_output=True, wait_post_processing=True)

# ===== Training Complete =====

if run.status == "Failed":
    print('### ERROR! Run did not complete. Training failed!')
    # SystemExit instead of the site-provided exit() helper, which is meant
    # for interactive use and is not guaranteed to exist.
    raise SystemExit(1)

# Fall back to 0.0 when the metric is missing as well as when it is falsy;
# indexing with ['accuracy'] raised KeyError before the fallback could apply.
accuracy = run.get_metrics().get('accuracy') or 0.0

model = run.register_model(
    # NOTE! Must be called 'outputs' this is expected by training scripts and fetch model process
    model_path='outputs/',
    model_name=os.environ['AZML_MODEL'],
コード例 #12
0
ファイル: run_electra.py プロジェクト: PriyatamNayak/FLAML
from azureml.core import Workspace, Experiment, ScriptRunConfig
# Load the workspace from the local configuration.
ws = Workspace.from_config()

# Compute target to run on; the commented line is an alternative target.
compute_target = ws.compute_targets['V100-4']
# compute_target = ws.compute_targets['K80']
# Command for the run: install the dependencies, then start the test script.
# NOTE(review): the "&&" lives at the end of the first list entry, so this
# relies on how ScriptRunConfig joins list entries into one command line; confirm.
command = [
    "pip install torch transformers datasets flaml[blendsearch,ray] && ",
    "python test_electra.py"
]

# Run the command with 'hf/' as the source directory on the chosen target.
config = ScriptRunConfig(
    source_directory='hf/',
    command=command,
    compute_target=compute_target,
)

# Submit to the 'test-electra' experiment and stream the output until done.
exp = Experiment(ws, 'test-electra')
run = exp.submit(config)
print(run.get_portal_url())  # link to ml.azure.com
run.wait_for_completion(show_output=True)
コード例 #13
0
def main():
    """Parse the command line and submit an Elbencho run to a BeeOND enabled cluster.

    The workspace, cluster, environment and experiment are obtained through
    the module's helper functions using the values in ``sharedconfig``. The
    process exits with status -1 when the cluster cannot be created or
    updated (RuntimeError). With ``--follow`` the run output is streamed
    until the run completes. ``--keep-cluster`` is parsed but not read in
    this function.
    """
    cli = argparse.ArgumentParser(
        description="Run Elbencho on a BeeOND enabled cluster"
    )
    cli.add_argument("num_nodes", type=int, help="Number of nodes")
    cli.add_argument("--follow", action="store_true", help="Follow run output")
    cli.add_argument(
        "--keep-cluster",
        action="store_true",
        help="Don't autoscale cluster down when idle (after run completed)",
    )
    cli.add_argument(
        "--keep-failed-cluster", dest="terminate_on_failure", action="store_false"
    )
    cli.add_argument("--sharedfiles", action="store_false", dest="multifile")
    options = cli.parse_args()

    ws = get_or_create_workspace(
        sharedconfig.subscription_id,
        sharedconfig.resource_group_name,
        sharedconfig.workspace_name,
        sharedconfig.location,
    )

    try:
        connector = create_or_update_cluster(
            ws,
            sharedconfig.cluster_name,
            options.num_nodes,
            sharedconfig.ssh_key,
            sharedconfig.vm_type,
            terminate_on_failure=options.terminate_on_failure,
            use_beeond=True,
        )
    except RuntimeError:
        cprint("Fatal Error - exiting", "red", attrs=["bold"])
        sys.exit(-1)

    # Volume mapping handed to the environment as docker arguments.
    volume_mapping = "{}:{}".format(connector.beeond_mnt, sharedconfig.beeond_map)
    docker_args = ["-v", volume_mapping]

    # Get and update the AzureML Environment object
    env = create_or_update_environment(
        ws, sharedconfig.environment_name, sharedconfig.docker_image, docker_args
    )

    # Get/Create an experiment object
    exp = Experiment(workspace=ws, name=sharedconfig.experiment_name)

    # Configure the distributed compute settings: one process on every node
    mpi_settings = MpiConfiguration(
        node_count=options.num_nodes, process_count_per_node=1
    )

    # --sharedfiles switches from the multi-file to the large-file script
    runscript = (
        "./run_elbencho_multifile.sh"
        if options.multifile
        else "./run_elbencho_largefile.sh"
    )

    # Command line for the elbencho script, followed by the node addresses
    command_line = [
        "bash",
        runscript,
        sharedconfig.beeond_map,
        str(options.num_nodes),
    ] + list(connector.ibaddrs)

    # Define the configuration for running the script
    run_config = ScriptRunConfig(
        source_directory="scripts",
        command=command_line,
        compute_target=connector.cluster,
        environment=env,
        distributed_job_config=mpi_settings,
    )

    # Tags record the run parameters (avoids grepping the logs)
    tags = {
        "class": k_runclass,
        "vmtype": sharedconfig.vm_type,
        "num_nodes": options.num_nodes,
        "run_type": "multifile" if options.multifile else "sharedfile",
    }

    # Submit the run
    submitted = exp.submit(config=run_config, tags=tags)

    # Optionally follow the output on the command line
    if options.follow:
        submitted.wait_for_completion(show_output=True)
コード例 #14
0
class AzureMLTrainer(trainer.Trainer):
    # Class-level defaults. The double-underscore names are name-mangled and
    # therefore private to this class.
    # Not modified anywhere in the visible part of the class.
    is_connected: bool = False
    # Relative path of a workspace config file; not referenced in the visible part of the class.
    __config_file: str = '.azureml/config.json'
    # Workspace passed to __init__.
    __workspace: Workspace = None
    # Experiment the runs belong to; set in __init__.
    __experiment: Experiment = None
    # Name of the experiment; annotation only, assigned in __init__.
    __current_experiment_name: str
    # Run currently in use; set in __init__ (when a run is passed) and in new_run.
    __current_run: Run = None
    # Root logger, assigned in __init__.
    __logger: Logger = None
    # Not referenced in the visible part of the class.
    __vm_size_list: list = None

    def __init__(self, experiment_name: str, aml_workspace: Workspace, aml_run: Run = None):
        '''
        Creates a connected Trainer whose runs are persisted and logged on an AzureML workspace
        Args:
            experiment_name (str): Name of the experiment on AzureML; only used when no aml_run is given
            aml_workspace (Workspace): The connected workspace on AzureML
            aml_run (Run): Optional existing run.  When given, it becomes the current run and its experiment (and that experiment's name) is used
        '''
        self.__workspace = aml_workspace
        self.__logger = logging.getLogger()

        if aml_run is None:
            # No run supplied: look up / create the experiment by name
            self.__current_experiment_name = experiment_name
            self.__experiment = Experiment(workspace=self.__workspace, name=experiment_name)
            return

        # Attach to the supplied run and take the experiment from it
        self.__current_run = aml_run
        self.__experiment = aml_run.experiment
        self.__current_experiment_name = aml_run.experiment.name


    @classmethod
    def CreateFromContext(cls):
        '''
        Builds a Trainer from the Run returned by Run.get_context(), using that run's experiment name and workspace.  Intended for code that is executed as part of a submitted run (e.g. in an Estimator)
        Returns: 
            AzureMLTrainer: an instance of AzureMLTrainer bound to the current run, allowing the user to work connected.
        '''
        active_run = Run.get_context()
        experiment = active_run.experiment
        return cls(experiment.name, experiment.workspace, active_run)


    def new_run(self, description: str = None, copy_folder: bool = True, metrics: dict = None) -> Run:
        '''
        Starts a new interactive run on the existing AzureML Experiment.  A run that is still held as current run is completed first.
        Args:
            description (str): Optional text that is logged to the run under 'Description'
            copy_folder (bool): When False, logging is started with snapshot_directory = None; when True the default of start_logging is used
            metrics (dict): Optional name/value pairs that are logged to the new run right away
        Returns:
            Run: the AzureML Run object that can be used for further access and custom logic
        '''
        previous_run = self.__current_run
        if previous_run is not None:
            previous_run.complete()

        # Only override the snapshot directory when no folder copy is wanted
        logging_options = {} if copy_folder else {'snapshot_directory': None}
        started_run = self.__experiment.start_logging(**logging_options)
        self.__current_run = started_run

        if metrics is not None:
            for metric_name, metric_value in metrics.items():
                started_run.log(metric_name, metric_value)

        if description is not None:
            started_run.log('Description', description)

        return started_run

    def add_tuning_result(self, run_index: int, train_score: float, test_score: float, sample_count: int, durations:np.array, parameters: dict, estimator):
        '''
        Logs the result of one cross validation fold of a Grid Search: the scores go to the current run, the parameters to a child run named 'Gridsearch<run_index>'
        Args:
            run_index (int): Index of the fold, used in the name of the child run
            train_score (float): The given score of the training data
            test_score (float): The given score of the test data
            sample_count (int): The number of samples that were part of a fold (not used by this method)
            durations (np.array): The different durations of the Grid Search (not used by this method)
            parameters (dict): The parameter combinations that have been tested in this cross validation fold.  None values are logged as the string 'None'
            estimator (model): The actual fitted estimator / model that was trained in this fold (not used by this method)
        '''
        parent_run = self.__current_run
        fold_run = parent_run.child_run('Gridsearch' + str(run_index))
        parent_run.log_row('Trainscore', score = train_score)
        parent_run.log_row('Testscore', score = test_score)

        # One row for the 'Results' table: both scores followed by every parameter
        result_row = {'Testing score': test_score, 'Training score': train_score}
        for param_name, param_value in parameters.items():
            logged_value = 'None' if param_value is None else param_value
            fold_run.log(param_name, logged_value)
            result_row[param_name] = logged_value

        parent_run.log_row('Results', '', **result_row)
        fold_run.complete()


    def get_best_model(self, metric_name:str, take_highest:bool = True):
        '''
        Tags and returns the best run of the experiment, based on the given metric.  Only runs that logged the metric are considered.
        Args:
            metric_name (str): The name of the metric, such as accuracy
            take_highest (bool): In case of accuracy and score, this is typically True.  In case you want to get the model based on the lowest error, you can use False
        Returns:
            Run: the best run, which will be tagged as 'Best run'
        Raises:
            ValueError: when no run of the experiment has logged the metric
        '''
        runs = {}
        metric_values = {}
        for r in tqdm(self.__experiment.get_runs()):
            run_metrics = r.get_metrics()
            if metric_name in run_metrics:
                runs[r.id] = r
                metric_values[r.id] = run_metrics[metric_name]

        if not runs:
            raise ValueError(f"No run in the experiment has logged the metric '{metric_name}'")

        # Honour take_highest: previously the lowest value was always selected
        select = max if take_highest else min
        best_run_id = select(metric_values, key = metric_values.get)
        best_run = runs[best_run_id]
        best_run.tag('Best run')
        return best_run

    def get_azureml_experiment(self):
        '''
        Gives access to the AzureML experiment object this trainer holds (the one assigned in __init__)
        Returns:
            Experiment: the existing experiment
        '''
        return self.__experiment
        
    def complete_run(self, fitted_model, metrics_to_log: dict = None, upload_model: bool = True):
        '''
        Logs the given metrics, optionally saves the model to the local 'outputs' folder and completes the active Run
        Args:
            fitted_model (model): The already fitted model to be saved.  A model whose type name contains 'keras' is saved with its own save method, anything else with joblib
            metrics_to_log (dict): The metrics that should be logged with the model to the run
            upload_model (bool): When True (default) the model is written to 'outputs/model' (Keras) or 'outputs/model.pkl' (other); presumably AzureML captures that folder with the run - verify
        '''
        # Local import: the file-level import block is not visible from here
        import os

        is_keras = 'keras' in str(type(fitted_model))

        if metrics_to_log is not None:
            for k, v in metrics_to_log.items():
                self._log_metrics(k, v)

        if upload_model:
            # joblib.dump does not create missing folders, so make sure the target exists
            os.makedirs('outputs', exist_ok = True)
            if is_keras:
                fitted_model.save('outputs/model')
            else:
                joblib.dump(value = fitted_model, filename = 'outputs/model.pkl')

        self._complete_run()

    def evaluate_classifier(self, fitted_model, X_test: np.array, y_test: np.array, show_roc: bool = False, save_curves_as_image: bool = False,
                             class_names: np.array = None, finish_existing_run: bool = True, upload_model: bool = True, return_predictions: bool = False) -> np.array:

        '''
        Predicts the test set with the model, prints a classification report and logs confusion matrix and accuracy to the active Run on AzureML
        Args:
            fitted_model (model): The already fitted model to be tested.  Sklearn and Keras models have been tested
            X_test (np.array): The test set to calculate the predictions with
            y_test (np.array): The output test set to evaluate the predictions against
            show_roc (bool): This will log the ROC curve to the run; only allowed when there are exactly two class names
            save_curves_as_image (bool): Passed on to add_training_plots for Keras models, to save the training curves as images
            class_names (np.array): The class names that will be linked to the Confusion Matrix.  If not provided, the sorted unique values of y_test are used
            finish_existing_run (bool): Will complete the existing run on AzureML through complete_run (defaults to True)
            upload_model (bool): Passed on to complete_run (defaults to True)
            return_predictions (bool): If true, the y_pred values will be returned
        Returns: 
            np.array: The predicted (y_pred) values when return_predictions is true, otherwise None
        Raises:
            AttributeError: when show_roc is requested and the number of class names is not 2
        '''
        model_is_keras = 'keras' in str(type(fitted_model))

        # Predict X_test with model
        if not model_is_keras:
            predictions = fitted_model.predict(X_test)
        else:
            if 'predict_classes' in dir(fitted_model):
                predictions = fitted_model.predict_classes(X_test)
            else:
                # Take the index of the highest value per sample as predicted class
                predictions = np.argmax(fitted_model.predict(X_test), axis=1)
            self.add_training_plots(fitted_model, save_image=save_curves_as_image)

        if class_names is None:
            distinct_labels = sorted(np.unique(y_test))
            class_names = np.char.mod('%d', distinct_labels)

        # Print classification report
        print(metrics.classification_report(y_test, predictions))

        # Confusion matrix
        confusion = metrics.confusion_matrix(y_test, predictions)
        self._log_confmatrix(confusion, class_names)

        # Accuracy, logged as a percentage
        accuracy_pct = metrics.accuracy_score(y_test, predictions) * 100
        self._log_metrics('accuracy', accuracy_pct, description='')

        if show_roc == True:
            # A ROC curve is only logged for a binary classifier
            if len(class_names) != 2:
                raise AttributeError('Showing a ROC curve is only possible for binary classifier, not for multi class')
            self.__log_roc_curve(y_test, predictions)

        if finish_existing_run:
            self.complete_run(fitted_model, upload_model = upload_model)

        if return_predictions:
            return predictions

    def add_training_plots(self, fitted_model, metrics=None, save_image: bool = False):
        '''
        Add the training plots to the Run history
        Args:
            fitted_model (Keras model): the fitted model that contains the training history
            metrics (list): the metrics that should be tracked to the run.  If None, all available metrics will be taken.  Names that are not part of the history are skipped
            save_image (bool): when True, an image is logged for every metric that does not start with 'val_', showing the metric and, if available, its 'val_' counterpart
        '''
        recorded = fitted_model.history.history
        if metrics is None:
            metrics = recorded.keys()

        for metric in metrics:
            if metric not in recorded:
                continue
            self.__current_run.log_table(f'Plot {metric}', {metric: recorded[metric]})

            if save_image and not metric.startswith('val_'):
                plt.plot(recorded[metric])
                legend = ['train']
                # The validation series only exists when the model was fitted with
                # validation data; plot it only when present instead of failing with KeyError
                val_key = f'val_{metric}'
                if val_key in recorded:
                    plt.plot(recorded[val_key])
                    legend.append('test')
                plt.title(f'model {metric}')
                plt.ylabel(metric)
                plt.xlabel('epoch')
                plt.legend(legend, loc='upper left')
                self.__current_run.log_image(f'model {metric}', plot=plt)
                plt.close()

    def evaluate_image_classifier(self, fitted_model, X_test: np.array, y_test: np.array, show_roc: bool = False, failed_classifications_to_save: int = 0, image_shape = None, save_curves_as_image: bool = False,
                                class_names: np.array = None, finish_existing_run: bool = True, upload_model: bool = True, return_predictions: bool = False) -> np.array:

        '''
        Will predict and evaluate a model against a test set and save all results to the active Run on AzureML
        Args:
            fitted_model (model): The already fitted model to be tested.  Sklearn and Keras models have been tested
            X_test (np.array): The test set to calculate the predictions with
            y_test (np.array): The output test set to evaluate the predictions against
            show_roc (bool): This will upload the ROC curve to the run in case of a binary classifier
            failed_classifications_to_save (int): If greater than 0, this amount of incorrectly classified images will be tracked to the Run
            image_shape ((int, int, int)): Indicates if images should be reshaped before saving them
            save_curves_as_image (bool): Forwarded to evaluate_classifier
            class_names (np.array): The class names that will be used in the description.  If not provided, the unique values of the y_test matrix will be used
            finish_existing_run (bool): Will complete the existing run on AzureML (defaults to True)
            upload_model (bool): This will upload the model (pkl file) to AzureML run (defaults to True)
            return_predictions (bool): If true, the y_pred values will be returned
        Returns: 
            np.array: The predicted (y_pred) values against the model, or None when return_predictions is False
        ''' 
        # The run is kept open during the generic evaluation so that the failed
        # classification images can still be logged to it afterwards.
        y_pred = self.evaluate_classifier(fitted_model, X_test, y_test, show_roc=show_roc, save_curves_as_image=save_curves_as_image, class_names=class_names, finish_existing_run=False, upload_model=upload_model, return_predictions=True)

        if failed_classifications_to_save > 0:
            # Only needed when images are actually saved
            import random
            from arcus.ml.images import explorer

            # Take a random sample of the incorrectly classified images and save them
            incorrect_predictions = [i for i, item in enumerate(y_pred) if item != y_test[i]]
            total_images = min(len(incorrect_predictions), failed_classifications_to_save)

            for i in random.sample(incorrect_predictions, total_images):
                pred_class = y_pred[i]
                act_class = y_test[i]
                if class_names is not None:
                    pred_class = class_names[pred_class]
                    act_class = class_names[act_class]

                image = X_test[i]
                if image_shape is not None:
                    # Reshape image before saving it
                    image = image.reshape(image_shape)
                imgplot = explorer.show_image(image, silent_mode=True)
                description = f'Predicted {pred_class} - Actual {act_class}'
                self.__current_run.log_image(description, plot=imgplot)
                # Release the figure, as save_image_outputs does after logging
                imgplot.close()

        if finish_existing_run:
            # Honour the documented contract: complete the run once everything is logged
            self.complete_run(fitted_model, upload_model=upload_model)

        if return_predictions:
            return y_pred




    def __stack_images(self, img1: np.array, img2: np.array):
        '''
        Places img2 underneath img1 on a new zero-filled canvas with 3 channels
        Args:
            img1 (np.array): The image that ends up at the top
            img2 (np.array): The image that ends up at the bottom
        Returns: 
            np.array: Canvas as high as both images together and as wide as the widest one
        '''
        top_height, top_width = img1.shape[:2]
        bottom_height, bottom_width = img2.shape[:2]

        canvas_shape = (top_height + bottom_height, max(top_width, bottom_width), 3)
        canvas = np.zeros(shape=canvas_shape)

        # Both images are left aligned; uncovered pixels stay zero
        canvas[:top_height, :top_width] = img1
        canvas[top_height:top_height + bottom_height, :bottom_width] = img2
        return canvas

    def __concat_images(self, image_list: np.array) -> np.array:
        '''
        Stacks all given images underneath each other, in the given order
        Args:
            image_list (np.array): The images to combine
        Returns: 
            np.array: The combined image, the single image when only one was given, or None for an empty list
        '''
        remaining = iter(image_list)
        combined = next(remaining, None)
        for image in remaining:
            combined = self.__stack_images(combined, image)
        return combined

 

    def save_image_outputs(self, X_test: np.array, y_test: np.array, y_pred: np.array, samples_to_save: int = 1) -> np.array:
        '''
        Will log randomly chosen input / expected / predicted image combinations to the run
        Args:
            X_test (np.array): The input images for the model
            y_test (np.array): The actual expected output images of the model
            y_pred (np.array): The predicted or calculated output images of the model
            samples_to_save (int): If greater than 0, this amount of input, output and generated image combinations will be tracked to the Run
        ''' 
        from arcus.ml.images import explorer

        if samples_to_save <= 0:
            return

        import random
        sample_count = min(len(y_pred), samples_to_save)
        chosen_indices = random.sample(range(len(y_pred)), sample_count)

        for index in chosen_indices:
            # Input, expected output and prediction are stacked into one image
            combo = self.__concat_images([X_test[index], y_test[index], y_pred[index]])
            figure = explorer.show_image(combo, silent_mode=True)
            self.__current_run.log_image(f'Image combo sample {index}', plot=figure)
            figure.close()

    def setup_training(self, training_name: str, overwrite: bool = False):
        '''
        Will make sure the training directory exists and contains the default training script and requirements file
        Args:
            training_name (str): The name of a training.  This will be used to create a directory.  Can contain subdirectory
            overwrite (bool): Defines if the existing training files should be overwritten
        '''
        if not os.path.exists(training_name):
            os.makedirs(training_name)

        # The default files are shipped in the 'resources' folder next to this module
        package_dir = str(os.path.dirname(__file__))
        templates = (
            ('resources/train.py', 'train.py'),
            ('resources/requirements.txt', 'requirements.txt'),
        )

        for resource_path, file_name in templates:
            source_file = os.path.join(package_dir, resource_path)
            target_file = os.path.join(training_name, file_name)
            # Existing files are only replaced when explicitly requested
            if overwrite or not os.path.isfile(target_file):
                shutil.copy2(source_file, training_name)
        
    def start_training(self, training_name: str, environment_type: str = None, input_datasets: np.array = None, 
                        input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, 
                        script_parameters: dict = None, show_widget: bool = True, use_estimator: bool = False, **kwargs):
        ''' 
        Will submit a new training, taking the training name as the folder of the run
        Args:
            training_name (str): The name of a training.  This will be used to create a directory.  Can contain subdirectory
            environment_type (str): either the name of an existing environment that will be taken as base, or one of these values (tensorflow, sklearn, pytorch).  
            input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name
            input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name
            compute_target (str): The compute target (default = 'local') on which the training should be executed
            gpu_compute (bool): Indicates if GPU compute is required for this script or not
            script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script
            show_widget (bool): Will display the live tracking of the submitted Run
            use_estimator (bool): If True the run is submitted through an Estimator, otherwise through a ScriptRunConfig
            **kwargs: Forwarded unchanged to the selected submission method
        Returns:
            Run : the submitted run
        '''
        if use_estimator:
            announcement = 'Scheduling Estimator training'
            submit_training = self._start_estimator_training
        else:
            announcement = 'Scheduling ScriptRunConfig training'
            submit_training = self._start_environment_training

        print(announcement)
        submit_training(training_name, environment_type, input_datasets, input_datasets_to_download, compute_target, gpu_compute, script_parameters, show_widget, **kwargs)

        # Track the script parameters on the run, without their '--' prefix
        if script_parameters is not None:
            for parameter, value in script_parameters.items():
                self.__current_run.log(parameter.replace('--', ''), value)

        print(self.__current_run.get_portal_url())

        if show_widget:
            from azureml.widgets import RunDetails
            RunDetails(self.__current_run).show()

        return self.__current_run

    def _start_environment_training(self, training_name: str, environment_type: str = None, input_datasets: np.array = None, 
                                    input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, 
                                    script_parameters: dict = None, show_widget: bool = True, **kwargs):
        ''' 
        Will start a new training using ScriptRunConfig, taking the training name as the folder of the run.
        The submitted run is stored as the current run of this trainer.
        Args:
            training_name (str): The name of a training.  This is the directory that contains train.py and requirements.txt.  Can contain subdirectory
            environment_type (str): either the name of an existing environment that will be taken as base, or one of these values (tensorflow, sklearn, pytorch).  
            input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name
            input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name
            compute_target (str): The compute target (default = 'local') on which the training should be executed
            gpu_compute (bool): Indicates if GPU compute is required for this script or not
            script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script
            show_widget (bool): Not used by this method
            **kwargs: Not used by this method
        Raises:
            FileNotFoundError: When training_name is not an existing directory
        '''
        from azureml.core import ScriptRunConfig
        from azureml.core.runconfig import RunConfiguration
        from arcus.azureml.experimenting import train_environment as te

        # Check if directory exists
        if not os.path.isdir(training_name):
            raise FileNotFoundError(training_name)

        # Check compute target
        if compute_target != 'local':
            self.__check_compute_target(compute_target, gpu_compute)

        training_env = te.get_training_environment(self.__workspace, training_name, os.path.join(training_name, 'requirements.txt'), use_gpu=gpu_compute, include_prerelease=True, environment_type=environment_type)
        runconfig = RunConfiguration()

        # Script parameters are passed as alternating key / value arguments
        scriptargs = []
        if script_parameters is not None:
            for key, value in script_parameters.items():
                scriptargs.extend([key, value])

        # Datasets are passed as additional arguments to the script
        if input_datasets is not None:
            for ds in input_datasets:
                print(f'Adding mounting data reference for dataset {ds}')
                scriptargs.append(self.__workspace.datasets[ds].as_named_input(ds).as_mount(path_on_compute = ds))
        if input_datasets_to_download is not None:
            for ds in input_datasets_to_download:
                print(f'Adding download data reference for dataset {ds}')
                scriptargs.append(self.__workspace.datasets[ds].as_named_input(ds).as_download(path_on_compute = ds))

        # training_name is used as is (no './' prefix), so absolute paths resolve to the
        # same directory that was validated above and that holds requirements.txt
        scriptrunconfig = ScriptRunConfig(source_directory=training_name, script="train.py", run_config=runconfig, 
                                            arguments=scriptargs)
        scriptrunconfig.run_config.target = compute_target
        scriptrunconfig.run_config.environment = training_env

        # Submit training
        self.__current_run = self.__experiment.submit(scriptrunconfig)
        


    def _get_data_reference(self, dataset: Dataset):
        '''
        Extracts two quoted values from the first 'source' entry of the string representation of a dataset
        Args:
            dataset (Dataset): The dataset whose string representation is JSON, optionally prefixed with 'FileDataset'
        Returns: 
            (str, str): The first and second single-quoted value of the first source entry
                (presumably datastore name and path on the datastore; verify against the caller)
        '''
        import json

        description = str(dataset).replace('FileDataset\n', '')
        first_source = json.loads(description)['source'][0]

        # Splitting on the single quote puts the quoted values at the odd positions
        pieces = first_source.split("'")
        first_value, second_value = pieces[1], pieces[3]
        return first_value, second_value

    def _start_estimator_training(self, training_name: str, estimator_type: str = None, input_datasets: np.array = None, input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, script_parameters: dict = None, show_widget: bool = True, **kwargs):
        ''' 
        Will start a new training using an Estimator, taking the training name as the folder of the run.
        The submitted run is stored as the current run of this trainer.
        Args:
            training_name (str): The name of a training.  This is the directory that contains train.py and requirements.txt.  Can contain subdirectory
            estimator_type (str): one of these values (tensorflow, sklearn, pytorch), or None to use the default Estimator
            input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name
            input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name
            compute_target (str): The compute target (default = 'local') on which the training should be executed
            gpu_compute (bool): Indicates if GPU compute is required for this script or not
            script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script
            show_widget (bool): Not used by this method
            **kwargs: Not used by this method
        Raises:
            FileNotFoundError: When training_name is not an existing directory
            ValueError: When estimator_type is not one of the supported values
        '''
        # Check if directory exists
        if not os.path.isdir(training_name):
            raise FileNotFoundError(training_name)

        # Check compute target
        if compute_target != 'local':
            self.__check_compute_target(compute_target, gpu_compute)

        # Add datasets: first the mounted ones, then the downloaded ones
        datasets = []
        if input_datasets is not None:
            for ds in input_datasets:
                datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_mount(path_on_compute=ds))
        if input_datasets_to_download is not None:
            for ds in input_datasets_to_download:
                datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_download(path_on_compute=ds))

        constructor_parameters = {
            'source_directory':training_name,
            'script_params':script_parameters,
            'inputs':datasets,
            'compute_target':compute_target,
            'entry_script':'train.py',
            'pip_requirements_file':'requirements.txt', 
            'use_gpu':gpu_compute,
            'use_docker':True}

        print('Creating estimator of type', estimator_type)

        # The estimator classes are imported lazily, so only the requested one is needed
        if estimator_type is None:
            # Using default Estimator
            from azureml.train.estimator import Estimator
            estimator = Estimator(**constructor_parameters)
        elif estimator_type == 'tensorflow':
            from azureml.train.dnn import TensorFlow
            version_par = 'framework_version'
            if version_par not in constructor_parameters:
                print('Defaulting to version 2.0 for TensorFlow')
                constructor_parameters[version_par] = '2.0'
            estimator = TensorFlow(**constructor_parameters)
        elif estimator_type == 'sklearn':
            from azureml.train.sklearn import SKLearn
            estimator = SKLearn(**constructor_parameters)
        elif estimator_type == 'pytorch':
            from azureml.train.dnn import PyTorch
            estimator = PyTorch(**constructor_parameters)
        else:
            # Fail with a clear message instead of an unbound 'estimator' variable
            raise ValueError(f"Unsupported estimator type '{estimator_type}'. Expected None, 'tensorflow', 'sklearn' or 'pytorch'")

        # Submit training
        self.__current_run = self.__experiment.submit(estimator)

    # protected implementation methods
    def _log_metrics(self, metric_name: str, metric_value: float, description:str = None):
        '''
        Prints a metric and logs it to the current run
        Args:
            metric_name (str): The name of the metric
            metric_value (float): The value of the metric
            description (str): Optional description that is passed along to the run
        '''
        print(metric_name, metric_value) 

        active_run = self.__current_run
        active_run.log(metric_name, metric_value, description=description)

    
    def _complete_run(self):
        '''
        Marks the current run as completed by calling complete() on it
        '''
        active_run = self.__current_run
        active_run.complete()

    def _log_confmatrix(self, confusion_matrix: np.array, class_names: np.array):
        '''
        Prints the confusion matrix and logs it, serialized as JSON, to the current run
        Args:
            confusion_matrix (np.array): The confusion matrix values
            class_names (np.array): The labels of the classes in the matrix
        '''
        payload = {
            'schema_type': 'confusion_matrix',
            'schema_version': 'v1',
            'data': {
                'class_labels': class_names.tolist(),
                'matrix': confusion_matrix.tolist(),
            },
        }

        print(confusion_matrix)

        serialized = json.dumps(payload)
        self.__current_run.log_confusion_matrix('Confusion matrix', serialized, description='')

    def _save_roc_curve(self, roc_auc: float, roc_plot: plt):
        '''
        Logs the ROC AUC value as metric and the given ROC plot as image to the current run
        Args:
            roc_auc (float): The area under the ROC curve
            roc_plot (plt): The plot that contains the ROC curve
        '''
        self._log_metrics('roc_auc', roc_auc)
        # Log the plot that was handed in instead of the global pyplot module
        self.__current_run.log_image('ROC Curve', plot=roc_plot)

    def __check_compute_target(self, compute_target, use_gpu: bool):
        '''
        Verifies that the VM size of the compute target matches the requested kind of compute
        Args:
            compute_target (AmlCompute or str): The compute target, or the name of a compute target in the workspace
            use_gpu (bool): Indicates if GPU compute is required
        Raises:
            TrainingComputeException: When the VM size of the target cannot be found in the list of supported
                VM sizes, when GPU is requested but the VM size has no GPUs, or when CPU is requested but the
                VM size has no vCPUs
        '''
        vm_size = ''
        if isinstance(compute_target, AmlCompute):
            vm_size = compute_target.vm_size
        elif isinstance(compute_target, str):
            compute = ComputeTarget(workspace=self.__workspace, name=compute_target)
            vm_size = compute.vm_size

        # The supported VM sizes are only retrieved once and then reused
        if self.__vm_size_list is None:
            self.__vm_size_list = AmlCompute.supported_vmsizes(self.__workspace)

        # VM size names are compared case insensitive
        wanted_size = str.upper(vm_size)
        vm_description = next(
            (candidate for candidate in self.__vm_size_list if str.upper(candidate['name']) == wanted_size),
            None)
        if vm_description is None:
            # Covers unknown VM sizes as well as compute targets of an unsupported type
            raise errors.TrainingComputeException(f"The VM size '{vm_size}' of compute target {compute_target} was not found in the supported VM sizes")

        if use_gpu and vm_description['gpus'] == 0:
            raise errors.TrainingComputeException(f'gpu_compute was specified, but the target does not have GPUs: {vm_description} ')
        if not use_gpu and vm_description['vCPUs'] == 0:
            raise errors.TrainingComputeException(f'cpu_compute was specified, but the target does not have CPUs: {vm_description} ')


    def __log_roc_curve(self, y_pred: np.array, y_test: np.array):
        '''Will plot and upload the Receiver Operating Characteristic (ROC) Curve for binary classifiers

        NOTE(review): evaluate_classifier calls this method positionally as
        (y_test, y_pred), which is the reverse of this signature, so truth and
        predictions reach roc_curve swapped - confirm and fix the call site.

        Args:
            y_pred (np.array): The predicted values of the test set 
            y_test (np.array): The actual outputs of the test set

        Returns: 
            float: The ROC_AUC value
        '''
        # False / true positive rates for all thresholds of the classification
        false_positive_rate, true_positive_rate, _ = metrics.roc_curve(y_test, y_pred)
        roc_auc = metrics.auc(false_positive_rate, true_positive_rate)
        auc_label = 'AUC = %0.2f' % roc_auc

        plt.cla()
        plt.title('Receiver Operating Characteristic')
        plt.plot(false_positive_rate, true_positive_rate, 'b', label = auc_label)
        plt.legend(loc = 'lower right')
        # Diagonal as visual reference line
        plt.plot([0, 1], [0, 1],'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')

        # The plot has to be logged before it is closed
        self._save_roc_curve(roc_auc, plt)
        plt.show(block=False)
        plt.close()
        return roc_auc
コード例 #15
0
def create_and_submit_experiment(workspace: Workspace,
                                 azure_config: AzureConfig,
                                 source_config: SourceConfig,
                                 model_config_overrides: str,
                                 azure_dataset_id: str) -> Run:
    """
    Submits a new run to an AzureML experiment in the given workspace, tags it, writes its recovery ID
    to the run recovery file and prints where the run can be followed.
    :param workspace: configured workspace to use to run the experiment in
    :param azure_config: azure related configurations to setup valid workspace
    :param source_config: The information about which code should be submitted, and which arguments should be used.
    :param model_config_overrides: A string that describes which model parameters were overwritten by commandline
     arguments in the present run. This is only used for diagnostic purposes (it is set as a Tag on the run).
    :param azure_dataset_id: The name of the dataset in blob storage to be used for this run.
    :returns: Run object for the submitted AzureML run
    """
    separator = "=============================================================================="

    friendly_name = azure_util.to_azure_friendly_string(
        create_experiment_name(azure_config))
    experiment = Experiment(workspace=workspace, name=friendly_name)
    submission_config = create_pytorch_environment(workspace, azure_config,
                                                   source_config,
                                                   azure_dataset_id)

    # Submit a training/testing run associated with the experiment, then tag it
    run: Run = experiment.submit(submission_config)
    set_run_tags(run, azure_config, model_config_overrides)

    print(f"\nSuccessfully queued new run for experiment: {experiment.name}")
    print(separator)

    if azure_config.run_recovery_id:
        print(f"\nRecovered from: {azure_config.run_recovery_id}")

    # Store the recovery ID in a fresh file, replacing any file of a previous run
    recovery_id = azure_util.create_run_recovery_id(run)
    recovery_file = Path(RUN_RECOVERY_FILE)
    if recovery_file.exists():
        recovery_file.unlink()
    recovery_file.write_text(recovery_id)

    print(separator)
    print(f"Experiment URL: {experiment.get_portal_url()}")
    print(f"Run URL: {run.get_portal_url()}")
    print(
        "If this run fails, re-start runner.py and supply these additional arguments: "
        f"--run_recovery_id={recovery_id}")
    print(
        f"The run recovery ID has been written to this file: {recovery_file}")
    print(separator)

    if not (azure_config.tensorboard and azure_config.azureml):
        print(
            f"To monitor this run locally using TensorBoard, run the script: "
            f"InnerEye/Azure/tensorboard_monitor.py --run_ids={run.id}")
        print(separator)
    else:
        print("Starting TensorBoard now because you specified --tensorboard")
        monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[run.id]),
                azure_config=azure_config)
    return run
コード例 #16
0
# Configure a PythonScriptStep with its own RunConfiguration.
# NOTE(review): the original header mentions debugpy and azure-debug-relay, but
# neither package is added in this snippet - confirm where they get installed.

run_config = RunConfiguration()
# Packages are added to the conda dependencies of the run configuration's environment
conda_dependencies = run_config.environment.python.conda_dependencies
conda_dependencies.add_conda_package("pip")
conda_dependencies.add_conda_package("scikit-learn")
# Pin azureml-sdk to the version of the locally imported amlcore module
conda_dependencies.add_pip_package("azureml-sdk==" + amlcore.__version__)
conda_dependencies.add_pip_package("azureml-defaults")

# allow_reuse=False is passed so the step is not configured for reuse
train_step = PythonScriptStep(name='Train Step with Debugging',
                              script_name="diabetes_train_2.py",
                              source_directory="./scripts",
                              compute_target=compute_target,
                              runconfig=run_config,
                              allow_reuse=False)

print('About to submit')

# Submit an Azure ML pipeline run that consists of the single training step
step_sequence = StepSequence(steps=[train_step])
pipeline = Pipeline(workspace, steps=step_sequence)
experiment = Experiment(workspace=workspace, name=experiment_name)
run = experiment.submit(pipeline)
print('submitted')
# Show the running experiment run in the notebook widget
#RunDetails(run).show()

# Block until the experiment run has completed
run.wait_for_completion()
コード例 #17
0
        est = ScriptRunConfig(
            source_directory=os.path.dirname(os.path.realpath(__file__)),
            arguments=[
                "--models", models, '--data_folder_train',
                'DatasetConsumptionConfig:{}'.format(input_name_train),
                '--data_folder_test',
                'DatasetConsumptionConfig:{}'.format(input_name_test),
                '--local', 'no'
            ],
            run_config=run_config)

    # Define the ML experiment
    experiment = Experiment(workspace, "explore_" + models)
    # Submit the experiment run; if the compute is idle, this may take some time
    run = experiment.submit(est)

if models == 'deeplearning':
    dataset_train = Dataset.get_by_name(workspace, name=input_name_train)
    dataset_test = Dataset.get_by_name(workspace, name=input_name_test)

    # define script parameters
    script_params_3 = {
        '--models': models,
        '--data_folder_train':
        dataset_train.as_named_input('train').as_mount(),
        '--data_folder_test': dataset_test.as_named_input('test').as_mount(),
        '--local': 'no'
    }

    estimator = PyTorch(
コード例 #18
0
ファイル: run_azure.py プロジェクト: jkhouja/experimenter
    def my_azure_app(cfg: DictConfig) -> None:
        """
        Submit the experiment described by ``cfg`` as a PyTorch estimator run on Azure ML.

        The configuration is converted to a plain container (interpolations are not resolved),
        written as a timestamped JSON file under ``root_path`` and uploaded, together with the
        data directory, to the default datastore of the workspace. The run is submitted to the
        "gpucluster" compute target, which is created when it does not exist yet.

        :param cfg: Configuration that provides the keys yaml_file, root_path, data_subdir,
            experiment_name, disable_gpu and vm_size.
        :raises RuntimeError: When the AZ_SUBS_ID environment variable is not set.
        """
        print(cfg.pretty())
        args_dict = OmegaConf.to_container(cfg, resolve=False)

        # Name the JSON copy of the configuration after the yaml file plus a timestamp
        yaml_file_nm = args_dict["yaml_file"].split("/")[-1].split(".")[0]
        conf_file = os.path.join(
            args_dict["root_path"],
            yaml_file_nm + "_" + str(datetime.datetime.now()) + ".json",
        )
        print(conf_file)

        with open(conf_file, "w") as out:
            out.write(json.dumps(args_dict))

        cluster_name = "gpucluster"
        experiment_name = args_dict["experiment_name"] + "_azure"
        disable_gpu = args_dict["disable_gpu"]
        # Use the original working directory reported by Hydra as source directory
        script_folder = hydra.utils.get_original_cwd()
        data_path = os.path.join(args_dict["root_path"],
                                 args_dict["data_subdir"])

        sub_id = os.getenv("AZ_SUBS_ID")
        if sub_id is None:
            # Explicit check: an assert would be stripped when running with -O
            raise RuntimeError(
                "The AZ_SUBS_ID environment variable must be set to the Azure subscription id"
            )

        ws = Workspace.get(
            name="experiments",
            subscription_id=sub_id,
            resource_group="default_resource_group",
        )

        # Reuse the compute target when it exists, otherwise provision a single node cluster
        try:
            compute_target = ComputeTarget(workspace=ws, name=cluster_name)
            print("Found existing compute target")
        except ComputeTargetException:
            print("Creating a new compute target...")
            compute_config = AmlCompute.provisioning_configuration(
                vm_size=args_dict["vm_size"], max_nodes=1)

            compute_target = ComputeTarget.create(ws, cluster_name,
                                                  compute_config)
            compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=10)

        s = ws.get_default_datastore()

        # Upload the data directory; files that already exist are not overwritten
        _ = s.upload(
            src_dir=data_path,
            target_path=data_path,
            overwrite=False,
            show_progress=True,
        )

        # Directory part and file name of the configuration file
        script_target_path = "/".join(conf_file.split("/")[:-1])
        print(script_target_path)
        script_fname = conf_file.split("/")[-1]
        print(script_fname)
        print("---" * 100)

        azure_script_path = s.upload_files(
            files=[conf_file],
            target_path=script_target_path,
            overwrite=True,
            show_progress=True,
        )

        print(azure_script_path)

        # References to the uploaded configuration file and to the root path on the datastore
        azure_script_abs_path = DataReference(datastore=s,
                                              data_reference_name="input_data",
                                              path_on_datastore=conf_file)

        azure_root_path = DataReference(
            datastore=s,
            data_reference_name="root_data",
            path_on_datastore=args_dict["root_path"],
        )

        exp = Experiment(workspace=ws, name=experiment_name)

        # Using pytorch estimator - proper way to submit pytorch jobs
        script_params = {
            "--config_file": azure_script_abs_path,
            "--root_path": azure_root_path,
            "--experiment_name": experiment_name,
        }

        print("GPU Disabled: {}".format(disable_gpu))

        estimator = PyTorch(
            source_directory=script_folder,
            script_params=script_params,
            compute_target=compute_target,
            entry_script="run.py",
            use_gpu=not disable_gpu,
            pip_packages=["pillow==5.4.1"],
        )

        # The returned run is not used any further
        _ = exp.submit(estimator)
コード例 #19
0
# Arguments for the training script: the mounted default datastore of the
# workspace as data folder, plus the hyperparameters of the network
script_params = {
    '--data-folder': ws.get_default_datastore().as_mount(),
    '--batch-size': 50,
    '--first-layer-neurons': 300,
    '--second-layer-neurons': 100,
    '--learning-rate': 0.01
}

# TensorFlow estimator that runs tf_mnist.py with GPU support on the compute target
est = TensorFlow(source_directory=script_folder,
                 script_params=script_params,
                 compute_target=compute_target,
                 entry_script='tf_mnist.py', 
                 use_gpu=True, 
                 framework_version='1.12')

run = exp.submit(est)

# Block until the run, including its post processing, has finished
run.wait_for_completion(show_output=True, wait_post_processing=True)

# Raise exception if run fails
if run.get_status() == "Failed":
    raise Exception(
        "Training on local failed with following run status: {} and logs: \n {}".format(
            run.get_status(), run.get_details_with_logs()
        )
    )

# Collect the run id; presumably written to /aml_config/run_id.json by code
# that follows this snippet - TODO confirm

run_id = {}
run_id["run_id"] = run.id
コード例 #20
0
                           entry_script='retrain.py',
                           pip_packages=['tensorflow_hub'],
                           node_count=1,
                           use_gpu=True)

    # Overwrite data store reference
    dr = DataReferenceConfiguration(
        datastore_name=ds.name,
        path_on_datastore='flower_photos',
        mode='download',  # download files from datastore to compute target
        overwrite=True)
    estimator.run_config.data_references[ds.name] = dr

    # Submit Experiment
    print("Training the model...")
    run = experiment.submit(estimator)
    run.wait_for_completion(show_output=True)

    print("Waiting for the run to complete...")
    status = run.get_status()
    while status != 'Completed' and status != 'Failed':
        print('current status: {} - waiting...'.format(run.get_status()))
        time.sleep(30)
        status = run.get_status()

    # Download results
    print("Downloading the results...")
    for filename in run.get_file_names():
        if filename.startswith('outputs'):
            print("downloading", filename, '...')
            run.download_file(filename,
コード例 #21
0
# Create a new RunConfiguration object for the python framework
conda_run_config = RunConfiguration(framework="python")

# Set compute target to the Linux DSVM
conda_run_config.target = dsvm_compute.name

# Set the data reference of the run configuration, keyed by the datastore name
conda_run_config.data_references = {ds.name: dr}

# Specify the conda packages to install on the VM; the list is stored as a
# Python literal in the 'train' section of the configuration
conda_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=ast.literal_eval(config['train']['conda_packages']))

from azureml.core import Run
from azureml.core import ScriptRunConfig

# The training script comes from the configuration and is run from the current directory
src = ScriptRunConfig(
    source_directory='./',
    script=config['train']['script'],
    run_config=conda_run_config,
    # pass the datastore reference as a parameter to the training script
    arguments=['--data-folder', str(ds.as_download())])
run = exp.submit(config=src)
# Block until the run has finished, streaming its output
run.wait_for_completion(show_output=True)

# Register the model file from the outputs of the run under the configured model name
print('Registering model...')
model = run.register_model(model_name=config['train']['model_name'],
                           model_path='./outputs/ridge_1.pkl')
print('Done registering model.')
コード例 #22
0
# Look up the registered training dataset in the workspace.
diabetes_ds = ws.datasets.get("diabetes dataset")

# Estimator that runs diabetes_training.py locally in the diabetes
# environment, with the dataset attached as the named input 'diabetes'.
estimator = Estimator(
    source_directory=experiment_folder,
    inputs=[diabetes_ds.as_named_input('diabetes')],
    script_params=script_params,
    compute_target='local',
    environment_definition=diabetes_env,
    entry_script='diabetes_training.py',
)

# Experiment that will own the run.
experiment = Experiment(workspace=ws, name='diabetes-training')

# Submit, show the run-details widget while it executes, then block until done.
run = experiment.submit(config=estimator)
run_widget = RunDetails(run)
run_widget.show()
run.wait_for_completion()

# Register the environment in the workspace.
diabetes_env.register(workspace=ws)


#run on remote compute

#check for existing

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
コード例 #23
0
def main(req: func.HttpRequest) -> (func.HttpResponse):
    """Submit a PyTorch training run to Azure ML in response to an HTTP trigger.

    Connects to the workspace named by the environment (creating it when the
    lookup fails for a reason other than ``ProjectSystemException``), makes
    sure the ``gpuforpytorch`` compute cluster exists, registers the
    training-data and model blob containers as datastores and submits
    ``pytorch_train.py`` from the ``HttpTrigger/project`` folder. The run is
    submitted but not awaited.

    Settings are read from these environment variables: TENANT_ID, APP_ID,
    PRINCIPAL_PASSWORD, AZURE_SUB, RESOURCE_GROUP, WORKSPACE_NAME,
    STORAGE_CONTAINER_NAME_TRAINDATA, STORAGE_CONTAINER_NAME_MODELS,
    STORAGE_ACCOUNT_NAME and STORAGE_ACCOUNT_KEY.

    :param req: incoming request; its ``start`` query parameter is read and
        only its type is logged.
    :return: a JSON-encoded string (not an ``HttpResponse`` object, despite
        the annotation): the status of the submitted run, or
        ``"ProjectSystemException"`` when the workspace lookup raised
        ``ProjectSystemException``.
    """
    logging.info('Python HTTP trigger function processed a request.')

    # For now this can be a POST where we have <base url>/api/HttpTrigger?start=<any string>
    image_url = req.params.get('start')
    logging.info(type(image_url))

    # Use service principal secrets to create authentication vehicle and
    # define workspace object.
    # svc_pr is pre-initialised so the fallback below can tell whether the
    # authentication object was ever built.
    svc_pr = None
    try:
        svc_pr = ServicePrincipalAuthentication(
            tenant_id=os.getenv('TENANT_ID', ''),
            service_principal_id=os.getenv('APP_ID', ''),
            service_principal_password=os.getenv('PRINCIPAL_PASSWORD', ''))

        ws = Workspace(subscription_id=os.getenv('AZURE_SUB', ''),
                    resource_group=os.getenv('RESOURCE_GROUP', ''),
                    workspace_name=os.getenv('WORKSPACE_NAME',''),
                    auth=svc_pr)
        print("Found workspace {} at location {} using Azure CLI \
            authentication".format(ws.name, ws.location))
    # Usually because authentication didn't work
    except ProjectSystemException:
        print('Authentication did not work.')
        return json.dumps('ProjectSystemException')
    # Need to create the workspace
    except Exception:
        if svc_pr is None:
            # Building the credentials themselves failed: creating a workspace
            # is impossible, so surface the real error instead of tripping
            # over an unbound ``svc_pr`` below.
            raise
        ws = Workspace.create(name=os.getenv('WORKSPACE_NAME', ''),
                    subscription_id=os.getenv('AZURE_SUB', ''),
                    resource_group=os.getenv('RESOURCE_GROUP', ''),
                    create_resource_group=True,
                    location='westus', # Or other supported Azure region
                    auth=svc_pr)
        print("Created workspace {} at location {}".format(ws.name, ws.location))

    # choose a name for your cluster - under 16 characters
    cluster_name = "gpuforpytorch"

    try:
        compute_target = ComputeTarget(workspace=ws, name=cluster_name)
        print('Found existing compute target.')
    except ComputeTargetException:
        print('Creating a new compute target...')
        # AML Compute config: a cluster that scales between 0 and 2 nodes
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',
                                                            min_nodes=0,
                                                            max_nodes=2)
        # create the cluster
        compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
        compute_target.wait_for_completion(show_output=True)

    # use get_status() to get a detailed status for the current cluster.
    # print(compute_target.get_status().serialize())

    # Project directory holding the training script. The copy step is left
    # disabled:
    project_folder = os.path.join(os.getcwd(), 'HttpTrigger', 'project')
    # os.makedirs(project_folder, exist_ok=True)
    # shutil.copy(os.path.join(os.getcwd(), 'HttpTrigger', 'pytorch_train.py'), project_folder)

    # Create an experiment
    experiment_name = 'fish-no-fish'
    experiment = Experiment(ws, name=experiment_name)

    # Use an AML Data Store for training data
    ds = Datastore.register_azure_blob_container(workspace=ws,
        datastore_name='funcdefaultdatastore',
        container_name=os.getenv('STORAGE_CONTAINER_NAME_TRAINDATA', ''),
        account_name=os.getenv('STORAGE_ACCOUNT_NAME', ''),
        account_key=os.getenv('STORAGE_ACCOUNT_KEY', ''),
        create_if_not_exists=True)

    # Use an AML Data Store to save models back up to
    ds_models = Datastore.register_azure_blob_container(workspace=ws,
        datastore_name='modelsdatastorage',
        container_name=os.getenv('STORAGE_CONTAINER_NAME_MODELS', ''),
        account_name=os.getenv('STORAGE_ACCOUNT_NAME', ''),
        account_key=os.getenv('STORAGE_ACCOUNT_KEY', ''),
        create_if_not_exists=True)

    # Set up for training ("trans" flag means - use transfer learning and
    # this should download a model on compute)
    # Using /tmp to store model and info due to the fact that
    # creating new folders and files on the Azure Function host
    # will trigger the function to restart.
    script_params = {
        '--data_dir': ds.as_mount(),
        '--num_epochs': 30,
        '--learning_rate': 0.01,
        '--output_dir': '/tmp/outputs',
        '--trans': 'True'
    }

    # Instantiate PyTorch estimator with upload of final model to
    # a specified blob storage container (this can be anything)
    estimator = PyTorch(source_directory=project_folder,
                        script_params=script_params,
                        compute_target=compute_target,
                        entry_script='pytorch_train.py',
                        use_gpu=True,
                        inputs=[ds_models.as_upload(path_on_compute='./outputs/model_finetuned.pth')])

    run = experiment.submit(estimator)
    print(run.get_details())

    # # The following would certainly be blocking, but that's ok for debugging
    # while run.get_status() not in ['Completed', 'Failed']: # For example purposes only, not exhaustive
    #    print('Run {} not in terminal state'.format(run.id))
    #    time.sleep(10)

    # The run is not awaited: the status returned is whatever the run reports
    # right after submission.
    return json.dumps(run.get_status())
コード例 #24
0
ファイル: azureml.py プロジェクト: eganjs/dask-cloudprovider
    async def __create_cluster(self):
        """Start the scheduler run on AzureML and connect this cluster to it.

        Submits ``start_scheduler.py`` as a single-node estimator run and polls
        the run every 5 seconds until it publishes a ``scheduler`` metric or
        ends as ``Failed``/``Canceled``. On success it records the scheduler
        address, checks (up to 5 attempts) whether the scheduler is reachable,
        sets up the RPC connection and port forwarding, starts the base
        cluster and finally scales to ``self.initial_node_count`` workers.

        On a connectivity failure, or when the base cluster fails to start,
        the run is cancelled, the compute target is deleted unless
        ``self.compute_target_set`` is truthy, the problem is logged and the
        method returns without raising.

        Raises:
            Exception: if the scheduler run ends as ``Failed`` or ``Canceled``
                before publishing its address.
        """
        self.__print_message("Setting up cluster")
        exp = Experiment(self.workspace, self.experiment_name)
        estimator = Estimator(
            os.path.join(self.abs_path, "setup"),
            compute_target=self.compute_target,
            entry_script="start_scheduler.py",
            environment_definition=self.environment_definition,
            script_params=self.scheduler_params,
            node_count=1,  ### start only scheduler
            distributed_training=MpiConfiguration(),
            use_docker=True,
            inputs=self.datastores,
        )

        run = exp.submit(estimator, tags=self.tags)

        self.__print_message("Waiting for scheduler node's IP")
        # NOTE(review): time.sleep() blocks the event loop while polling;
        # left as is because callers may rely on the current behaviour.
        status = run.get_status()
        while (status != "Canceled" and status != "Failed"
               and "scheduler" not in run.get_metrics()):
            print(".", end="")
            logger.info("Scheduler not ready")
            time.sleep(5)
            status = run.get_status()

        if status == "Canceled" or status == "Failed":
            run_error = run.get_details().get("error")
            error_message = "Failed to start the AzureML cluster."

            if run_error:
                error_message = "{} {}".format(error_message, run_error)
            # No exception is being handled here, so log with error() rather
            # than exception(), which would append a bogus "NoneType: None"
            # traceback.
            logger.error(error_message)

            if not self.compute_target_set:
                self.__delete_compute_target()

            raise Exception(error_message)

        print("\n")

        ### SET FLAGS
        self.scheduler_ip_port = run.get_metrics()["scheduler"]
        self.worker_params["--scheduler_ip_port"] = self.scheduler_ip_port
        self.__print_message(f'Scheduler: {run.get_metrics()["scheduler"]}')
        self.run = run

        ### CHECK IF ON THE SAME VNET
        max_retry = 5
        while self.same_vnet is None and max_retry > 0:
            time.sleep(5)
            await self.sync(self.__check_if_scheduler_ip_reachable)
            max_retry -= 1

        if self.same_vnet is None:
            self.run.cancel()
            if not self.compute_target_set:
                self.__delete_compute_target()
            # Not inside an exception handler either: use error().
            logger.error(
                "Connection error after retrying. Failed to start the AzureML cluster."
            )
            return

        ### REQUIRED BY dask.distributed.deploy.cluster.Cluster
        self.hostname = socket.gethostname()
        self.is_in_ci = (
            f"/mnt/batch/tasks/shared/LS_root/mounts/clusters/{self.hostname}"
            in os.getcwd())
        _scheduler = self.__prepare_rpc_connection_to_headnode()
        self.scheduler_comm = rpc(_scheduler)
        await self.sync(self.__setup_port_forwarding)

        try:
            await super()._start()
        except Exception as e:
            logger.exception(e)
            # CLEAN UP COMPUTE TARGET
            self.run.cancel()
            if not self.compute_target_set:
                self.__delete_compute_target()
            return

        await self.sync(self.__update_links)

        self.__print_message("Connections established")
        self.__print_message(f"Scaling to {self.initial_node_count} workers")

        if self.initial_node_count > 1:
            self.scale(self.initial_node_count
                       )  # LOGIC TO KEEP PROPER TRACK OF WORKERS IN `scale`
        self.__print_message("Scaling is done")
コード例 #25
0
ファイル: run.py プロジェクト: lebedov/dask-ml-on-azure-ml
# Choose how train.py is submitted: as an Estimator (default) or as a plain
# ScriptRunConfig. A truthy `cv` turns on the cross-validation settings.
use_estimator = True
if not use_estimator:
    arguments = ['--cv', str(cv)] if cv else []
    to_run = ScriptRunConfig(
        source_directory='.',
        script='train.py',
        arguments=arguments,
        run_config=run_conf,
    )
else:
    script_params = {'--cv': cv} if cv else None
    # dask-mpi uses 2 nodes for its scheduler and client
    node_count = cv + 2 if cv else None
    distributed_training = MpiConfiguration() if cv else None
    to_run = Estimator(
        source_directory='.',
        compute_target=compute_target,
        entry_script='train.py',
        script_params=script_params,
        node_count=node_count,
        use_gpu=False,
        conda_dependencies_file='env.yml',
        distributed_training=distributed_training,
    )

# Submit whichever configuration was built and wait for it, echoing output.
run = exp.submit(to_run)
run.wait_for_completion(show_output=True)
コード例 #26
0
# Run configuration: Python framework on the compute target, executed in the
# default CPU Docker image with the script environment variables set.
run_config = RunConfiguration(framework="python")
run_config.target = compute_target
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE
run_config.environment.environment_variables = script_env

# Pip packages for the run's conda environment.
training_pip_packages = ["scikit-learn", "scipy", "numpy"]
dependencies = CondaDependencies.create(pip_packages=training_pip_packages)
run_config.environment.python.conda_dependencies = dependencies

# Settings for the forecasting AutoML experiment.
automl_config_common = {
    'task': 'forecasting',
    'primary_metric': 'normalized_root_mean_squared_error',
    'verbosity': logging.INFO,
    'time_column_name': time_column_name,
    'max_horizon': horizon,
    'iterations': 10,
    'n_cross_validations': 5,
    'enable_ensembling': True,
}

# The data is supplied by get_data.py inside the script folder.
automl_config = AutoMLConfig(
    path=script_folder,
    data_script='get_data.py',
    compute_target=compute_target,
    run_configuration=run_config,
    **automl_config_common
)

# Submit training and fetch the best run together with its fitted model.
exp = Experiment(workspace=ws, name=experiment_name)
run = exp.submit(automl_config, show_output=True)
best_run, fitted_model = run.get_output()
コード例 #27
0
                      environment=env)

# Search space: random sampling over the number of topics.
topic_search_space = {"--num-topics": choice(5, 10, 15, 20)}
param_sampling = RandomParameterSampling(topic_search_space)

# HyperDrive sweep that maximises the "c_v" metric:
# at most 100 runs in total, 4 at a time.
hd = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=param_sampling,
    primary_metric_name="c_v",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=100,
    max_concurrent_runs=4,
)

# Submit the sweep and block until it has finished (without echoing output).
run = exp.submit(config=hd)
run.wait_for_completion(show_output=False)

# Report what the sweep logged and which files it produced.
print(run.get_metrics())
print(run.get_file_names())

# Register the 'outputs' folder of the best child run as the model.
best_run = run.get_best_run_by_primary_metric()
model = best_run.register_model(model_name='gensim_lda', model_path='outputs')
print(model.name, model.id, model.version, sep='\t')
コード例 #28
0
ファイル: AZML_DBSCAN.py プロジェクト: ml810326/DBSCAN_Pred
# Source of the remote training script; it is written to
# $script_folder/train.py by the %%writefile cell magic below.
#
# SECURITY: the storage account key must never be committed. The script reads
# it from the STORAGE_ACCOUNT_KEY environment variable of the compute node and
# skips the optional blob upload when it is not set. Any key that was ever
# stored in this file should be treated as leaked and rotated.
train_script_source = """
import argparse
import os
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler

from sklearn.externals import joblib

from azureml.core import Run
from utils import load_data

import subprocess
import sys

def install(package):
    subprocess.call([sys.executable, "-m", "pip", "install", package])

install('pandas')
install('azure-storage')
install('tables')

import pandas as pd

# let user feed in parameters, the location of the data files (from datastore),
parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point')
args = parser.parse_args()

data_folder = args.data_folder
print('Data folder:', data_folder)

# load train and test set into numpy arrays
Data_training = pd.read_csv(os.path.join(data_folder, 'data.csv'))
Data_training = StandardScaler().fit_transform(Data_training)

# get hold of the current run
run = Run.get_context()

db = DBSCAN(eps=2, min_samples=10).fit(Data_training)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(Data_training, labels))

db.core_sample_indices_
db.components_

os.makedirs('outputs', exist_ok=True)
pd.DataFrame(db.components_).to_csv("outputs/model.csv", header=None, index=None)

from azure.storage.blob import BlockBlobService
import tables

STORAGEACCOUNTNAME = "datatest123"
LOCALFILENAME = "outputs/model.csv"
# The account key is a secret: read it from the environment, never from source.
STORAGEACCOUNTKEY = os.environ.get("STORAGE_ACCOUNT_KEY")
CONTAINERNAME= "testconta"
BLOBNAME= "model/model.csv"

localfileprocessed = os.path.join(os.getcwd(),LOCALFILENAME) #assuming file is in current working directory
if not STORAGEACCOUNTKEY:
    # The blob copy is best-effort; the file in outputs/ is still kept with the run.
    print("STORAGE_ACCOUNT_KEY is not set; skipping upload to the blob:" + BLOBNAME)
else:
    output_blob_service=BlockBlobService(account_name=STORAGEACCOUNTNAME,account_key=STORAGEACCOUNTKEY)
    try:
        output_blob_service.create_blob_from_path(CONTAINERNAME,BLOBNAME,localfileprocessed)
    except Exception as err:
        print ("Something went wrong with uploading to the blob:"+ BLOBNAME)
        print(err)

# note file saved in the outputs folder is automatically uploaded into experiment record
# joblib.dump(value=clf, filename='outputs/sklearn_mnist_model.pkl')
"""

get_ipython().run_cell_magic('writefile', '$script_folder/train.py', train_script_source)

import shutil

# train.py imports load_data from utils, so ship utils.py next to it.
shutil.copy('utils.py', script_folder)

from azureml.train.sklearn import SKLearn

# The mounted 'dbscndata' folder of the datastore is passed to train.py.
script_params = {
    '--data-folder': ds.path('dbscndata').as_mount()
}

#establish the estimator for learning
est = SKLearn(source_directory=script_folder,
                script_params=script_params,
                compute_target=compute_target,
                entry_script='train.py')

print(ds.path('dbscndata').as_mount())

# submit the estimator as a new run of the experiment
run = exp.submit(config=est)

# show the run-details widget while training is in progress
from azureml.widgets import RunDetails
RunDetails(run).show()

# The widget does not block, and the model file only exists once the run has
# finished, so wait for completion before registering.
run.wait_for_completion(show_output=True)

# register model
model = run.register_model(model_name='dbscan', model_path='outputs/model.csv')
print(model.name, model.id, model.version, sep='\t')
コード例 #29
0
ファイル: azure_a2ml.py プロジェクト: raymanchester/a2ml
# Collect workspace details and the project folder into `output`
# (`output` is created before this fragment — presumably a dict; confirm).
output['Subscription ID'] = ws.subscription_id
output['Workspace'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Project Directory'] = project_folder
# NOTE(review): -1 presumably disables column-width truncation on the pandas
# version this was written for — confirm against the pandas version in use.
pd.set_option('display.max_colwidth', -1)
# Notebook-style display expression: the transposed one-column frame is built
# but not assigned or printed.
pd.DataFrame(data=output, index=['']).T

# Path of the experiment's CSV file. Per the original note the get_data script
# now handles this; `csv_file` is not used again in this fragment.
csv_file = "../data/" + experiment_name + ".csv"

# Settings forwarded to AutoMLConfig as keyword arguments below.
automl_settings = {
    "iteration_timeout_minutes": 10,
    "iterations": 30,
    "primary_metric": 'spearman_correlation',
    "preprocess": True,
    "verbosity": logging.DEBUG,
    "n_cross_validations": 5
}
# NOTE(review): this binds the read_csv function itself (it is not called) and
# `dflow` is not used again in this fragment — looks like leftover code; confirm.
dflow = dprep.read_csv

# Regression AutoML configuration; the data comes from get_data.py in the
# project folder and errors are logged to automl_errors.log.
automl_config = AutoMLConfig(task='regression',
                             debug_log='automl_errors.log',
                             path=project_folder,
                             compute_target=compute_target,
                             data_script="get_data.py",
                             **automl_settings)

# Submit the configuration to the 'automl_remote' experiment, showing output.
experiment = Experiment(ws, 'automl_remote')
remote_run = experiment.submit(automl_config, show_output=True)
コード例 #30
0
from azureml.core.script_run_config import ScriptRunConfig

# Run configuration that executes the script on AmlCompute.
run_config = RunConfiguration()
run_config.target = "amlcompute"

# AmlCompute will be created in the same region as the workspace;
# only the VM size is chosen here.
run_config.amlcompute.vm_size = 'STANDARD_D2_V2'

# Execute inside Docker, based on the default CPU image.
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE

# Dependencies are not user-managed: a conda environment is created in the
# Docker image from the conda dependencies given below.
run_config.environment.python.user_managed_dependencies = False

# Prepare the Docker image automatically on first use if it is not ready yet.
run_config.auto_prepare_environment = True

# Conda packages required by the training script.
sklearn_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'])
run_config.environment.python.conda_dependencies = sklearn_dependencies

# Submit train.py from the project folder and wait for the run to finish.
script_run_config = ScriptRunConfig(
    source_directory=project_folder,
    script='train.py',
    run_config=run_config,
)

run = experiment.submit(script_run_config)
run.wait_for_completion()