def get_df_from_dataset(dataset_path, dataset_name, dataset_is_remote=False):
    """
    Return a DataFrame by reading the dataset either from a directory
    of csv files or from a registered Azure Dataset
    """
    if dataset_is_remote:
        workspace = package_utils.get_workspace()
        df = Dataset.get_by_name(
            workspace=workspace, name=dataset_name
        ).to_pandas_dataframe()
    else:
        df = get_df_from_directory(pathlib.Path(dataset_path, dataset_name))
    return df
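# Usage sketch (illustrative only): "datasets" and "train" are assumed example
# values, not paths defined by this repository. dataset_path is ignored when
# dataset_is_remote=True.
local_df = get_df_from_dataset(dataset_path="datasets", dataset_name="train")
remote_df = get_df_from_dataset(
    dataset_path="datasets", dataset_name="train", dataset_is_remote=True
)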
def main(model_name="deploy", model_version=None, deployment_name="deploy"): """ Return a AciWebservice deploy config """ environment = get_environment( name=deployment_name, file_path="nd00333/model/deploy/environment.yml", ) logger.info(msg="main", extra={"environment": environment}) inference_config = InferenceConfig( source_directory="nd00333", entry_script="model/deploy/score.py", environment=environment, ) logger.info(msg="main", extra={"inference_config": inference_config}) workspace = package_utils.get_workspace() deployment_config = AciWebservice.deploy_configuration( cpu_cores=1.0, memory_gb=8.0, auth_enabled=True, enable_app_insights=True, collect_model_data=False, ) logger.info(msg="main", extra={"deployment_config": deployment_config}) model = Model(workspace, name=model_name, version=model_version) logger.info(msg="main", extra={"model": model}) service = Model.deploy( workspace, deployment_name, [model], inference_config, deployment_config, overwrite=True, ) logger.info(msg="main", extra={"service": service}) return service
def dataset_register_file(args):
    """
    Register a file dataset into the workspace
    """
    workspace = package_utils.get_workspace()
    datastore_path, target_path = datastore_upload_files(args)

    logger.info(
        msg="Dataset.File.from_files", extra={"datastore_path": datastore_path}
    )
    if not args.dry_run:
        file_dataset = Dataset.File.from_files(path=datastore_path)

    kwargs = {
        "workspace": workspace,
        "name": target_path,
        "create_new_version": False,
    }
    logger.info(msg="file_dataset.register", extra={"kwargs": kwargs})
    if not args.dry_run:
        _ = file_dataset.register(**kwargs)
def datastore_upload_files(args):
    """
    Get the default datastore and upload files into it
    """
    workspace = package_utils.get_workspace()
    datastore = package_utils.get_default_datastore(workspace)

    directory = pathlib.Path(args.dataset_path, args.dataset_name)
    if not os.path.exists(directory):
        msg = f"The dataset directory {directory} does not exist"
        logger.exception(msg)
        raise RuntimeError(msg)

    files = [
        os.path.abspath(file) for file in sorted(glob.glob(f"{directory}/*.csv"))
    ]
    target_path = f"{args.dataset_name}_{args.dataset_version}"
    kwargs = {
        "files": files,
        "target_path": target_path,
        "overwrite": args.dataset_overwrite,
    }
    logger.info(msg="datastore.upload_files", extra={"kwargs": kwargs})
    if not args.dry_run:
        try:
            _ = upload_files(datastore, **kwargs)
        except Exception as error:
            msg = f"Upload to target_path {target_path} failed"
            logger.exception(msg)
            raise RuntimeError(msg) from error

    datastore_path = [
        DataPath(datastore, str(pathlib.Path(target_path, os.path.basename(file))))
        for file in files
    ]
    return datastore_path, target_path
def dataset_register_tabular(args):
    """
    Register a tabular dataset into the workspace
    """
    workspace = package_utils.get_workspace()
    datastore_path, target_path = datastore_upload_files(args)

    kwargs = {"path": datastore_path, "set_column_types": DATA_TYPES}
    logger.info(
        msg="TabularDatasetFactory.from_delimited_files", extra={"kwargs": kwargs}
    )
    if not args.dry_run:
        tabular = TabularDatasetFactory.from_delimited_files(**kwargs)

    kwargs = {
        "workspace": workspace,
        "name": target_path,
        "create_new_version": False,
    }
    logger.info(msg="tabular.register", extra={"kwargs": kwargs})
    if not args.dry_run:
        _ = tabular.register(**kwargs)
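# Usage sketch: an args namespace carrying the attributes the registration
# helpers above read (dataset_path, dataset_name, dataset_version,
# dataset_overwrite, dry_run); all concrete values here are illustrative
# assumptions, not defaults defined by the repository.
from argparse import Namespace

args = Namespace(
    dataset_path="datasets",
    dataset_name="train",
    dataset_version="1.0",
    dataset_overwrite=False,
    dry_run=False,
)
dataset_register_tabular(args)

# The registered name follows the f"{dataset_name}_{dataset_version}" pattern
# used in datastore_upload_files, so the tabular dataset can be read back with:
df = Dataset.get_by_name(
    workspace=package_utils.get_workspace(), name="train_1.0"
).to_pandas_dataframe()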
def main(args):
    """
    Return an existing AmlCompute cluster or provision a new one
    """
    workspace = package_utils.get_workspace()

    if args.cluster_name is None:
        cluster_name = package_utils.trim_cluster_name(workspace.name)
    else:
        cluster_name = args.cluster_name

    try:
        compute_target = AmlCompute(workspace=workspace, name=cluster_name)
        logger.info(msg=f"Found existing cluster {cluster_name}")
    except ComputeTargetException:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size=args.cluster_sku,
            vm_priority=args.cluster_vm_priority,
            idle_seconds_before_scaledown=40 * 60,
            min_nodes=args.cluster_min_nodes,
            max_nodes=args.cluster_max_nodes,
        )
        compute_target_create = ComputeTarget.create(
            workspace, cluster_name, compute_config
        )
        compute_target_create.wait_for_completion(
            show_output=True, min_node_count=None, timeout_in_minutes=5
        )
        logger.info(
            msg="main",
            extra={"status": compute_target_create.get_status().serialize()},
        )
        compute_target = workspace.compute_targets[cluster_name]

    logger.info(msg="main", extra={"compute_target": compute_target.serialize()})

    return compute_target
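# Usage sketch: provisioning (or reusing) the cluster with the defaults from
# parse_args, which is assumed to live in this module (it is referenced
# elsewhere in the repository as aml_compute.parse_args) and to set the
# cluster_* attributes read above.
compute_args = parse_args()
compute_target = main(compute_args)
print(compute_target.get_status().serialize())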
def main(
    workspace=None,
    dataset_trainandvalidate_name=config.get_default_dataset_name("trainandvalidate"),
):
    """
    Return AutoMLConfig
    """
    if not workspace:
        workspace = package_utils.get_workspace()

    args = aml_compute.parse_args()
    cluster_max_nodes = 5
    args.cluster_max_nodes = cluster_max_nodes
    args.cluster_sku = "Standard_D12_v2"
    compute_target = aml_compute.main(args)
    logger.info(msg="main", extra={"compute_target": compute_target.serialize()})

    trainandvalidate = Dataset.get_by_name(
        workspace=workspace,
        name=dataset_trainandvalidate_name,
    )

    model_settings = {
        "task": "classification",
        "primary_metric": "norm_macro_recall",
    }

    ensemble_settings = {
        "iterations": 15,
        "allowed_models": [
            "LightGBM",
            "LogisticRegression",
            "SGD",
            "XGBoostClassifier",
        ],
        "enable_voting_ensemble": True,
        "enable_stack_ensemble": False,
    }

    dataset_settings = {
        "validation_size": 0.3,
        "featurization": "auto",
        "training_data": trainandvalidate,
        "label_column_name": "Label",
    }

    compute_settings = {
        "compute_target": compute_target,
        "max_cores_per_iteration": -1,
        "max_concurrent_iterations": cluster_max_nodes,
        "experiment_timeout_hours": 1.5,
    }

    automl_config = AutoMLConfig(
        **model_settings,
        **ensemble_settings,
        **dataset_settings,
        **compute_settings,
    )

    return automl_config
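# Usage sketch: submitting the returned AutoMLConfig to an experiment; the
# experiment name "automl" is an illustrative assumption.
from azureml.core import Experiment

automl_config = main()
experiment = Experiment(package_utils.get_workspace(), "automl")
run = experiment.submit(automl_config, show_output=True)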
def main(
    workspace=None,
    dataset_train_name=config.get_default_dataset_name("train"),
    dataset_validate_name=config.get_default_dataset_name("validate"),
):
    """
    Return HyperDriveConfig
    """
    if not workspace:
        workspace = package_utils.get_workspace()

    cluster_max_nodes = 4
    args = aml_compute.parse_args()
    args.cluster_max_nodes = cluster_max_nodes
    args.cluster_sku = "Standard_D2s_v3"
    args.cluster_vm_priority = "dedicated"
    compute_target = aml_compute.main(args)
    logger.info(msg="main", extra={"compute_target": compute_target.serialize()})

    environment = get_environment()
    logger.info(msg="main", extra={"environment": environment})

    run_config = RunConfiguration()
    run_config.target = compute_target
    run_config.environment = environment
    logger.info(msg="main", extra={"run_config": run_config})

    parameter_space = {
        "--hyperparameter-n_estimators": choice(range(15, 20 + 1, 5)),
        "--hyperparameter-criterion": choice(["gini", "entropy"]),
        "--hyperparameter-max_depth": choice(range(10, 15 + 1, 5)),
    }
    hyperparameter_sampling = GridParameterSampling(parameter_space)
    # The grid is exhaustive, so the total number of runs is the product of
    # the number of values of each hyperparameter.
    hyperparameter_sampling_number_of_runs = functools.reduce(
        operator.mul, [len(value[1][0]) for value in parameter_space.values()]
    )

    train = Dataset.get_by_name(
        workspace=workspace,
        name=dataset_train_name,
    )
    validate = Dataset.get_by_name(
        workspace=workspace,
        name=dataset_validate_name,
    )

    arguments = [
        "--dataset-train-path",
        train.as_named_input("train").as_mount(),
        "--dataset-validate-path",
        validate.as_named_input("validate").as_mount(),
        "--hyperparameter-n_jobs",
        -1,
        "--hyperparameter-random_state",
        0,
    ]
    script_run_config = ScriptRunConfig(
        source_directory="nd00333/model/hyperdrive/train",
        script="train.py",
        arguments=arguments,
        run_config=run_config,
        compute_target=compute_target,
        environment=environment,
        max_run_duration_seconds=60 * 10,
    )

    # GridParameterSampling is not an iterative process and does not profit
    # from an early-termination policy. Moreover, a highly accurate run may
    # follow an inaccurate one, so a termination policy that never cancels
    # any run is used here.
    policy = BanditPolicy(
        evaluation_interval=1,
        slack_factor=None,
        slack_amount=1.0,
        delay_evaluation=0,
    )

    hd_config = HyperDriveConfig(
        hyperparameter_sampling=hyperparameter_sampling,
        primary_metric_name="norm_macro_recall",
        primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
        max_total_runs=hyperparameter_sampling_number_of_runs,
        max_concurrent_runs=cluster_max_nodes,
        policy=policy,
        run_config=script_run_config,
    )

    return hd_config
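# Usage sketch: submitting the HyperDriveConfig and retrieving the best child
# run by the primary metric; the experiment name "hyperdrive" is illustrative.
from azureml.core import Experiment

hd_config = main()
experiment = Experiment(package_utils.get_workspace(), "hyperdrive")
run = experiment.submit(hd_config)
run.wait_for_completion(show_output=True)
best_run = run.get_best_run_by_primary_metric()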