Code example #1
def get_df_from_dataset(dataset_path, dataset_name, dataset_is_remote=False):
    """
    Return a DataFrame by reading the dataset either from a local
    directory of csv files or from a registered Azure ML Dataset
    """
    if dataset_is_remote:
        workspace = package_utils.get_workspace()
        df = Dataset.get_by_name(workspace=workspace,
                                 name=dataset_name).to_pandas_dataframe()
    else:
        df = get_df_from_directory(pathlib.Path(dataset_path, dataset_name))
    return df
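
A minimal usage sketch; the local directory layout and the dataset name "train_1" are assumptions for illustration only:

# Local: read csv files from datasets/train_1/
df = get_df_from_dataset("datasets", "train_1", dataset_is_remote=False)
# Remote: read the registered Azure ML dataset named "train_1"
df = get_df_from_dataset("datasets", "train_1", dataset_is_remote=True)
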
Code example #2
def main(model_name="deploy", model_version=None, deployment_name="deploy"):
    """
    Deploy the model as an AciWebservice and return the service
    """
    environment = get_environment(
        name=deployment_name,
        file_path="nd00333/model/deploy/environment.yml",
    )
    logger.info(msg="main", extra={"environment": environment})

    inference_config = InferenceConfig(
        source_directory="nd00333",
        entry_script="model/deploy/score.py",
        environment=environment,
    )
    logger.info(msg="main", extra={"inference_config": inference_config})

    workspace = package_utils.get_workspace()

    deployment_config = AciWebservice.deploy_configuration(
        cpu_cores=1.0,
        memory_gb=8.0,
        auth_enabled=True,
        enable_app_insights=True,
        collect_model_data=False,
    )
    logger.info(msg="main", extra={"deployment_config": deployment_config})

    model = Model(workspace, name=model_name, version=model_version)
    logger.info(msg="main", extra={"model": model})

    service = Model.deploy(
        workspace,
        deployment_name,
        [model],
        inference_config,
        deployment_config,
        overwrite=True,
    )
    logger.info(msg="main", extra={"service": service})

    return service
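
A sketch of how the returned service might be consumed with the standard azureml Webservice API; the model version is illustrative:

service = main(model_name="deploy", model_version=1, deployment_name="deploy")
# Block until the ACI deployment finishes, then inspect the endpoint
service.wait_for_deployment(show_output=True)
print(service.scoring_uri)
# auth_enabled=True, so requests must carry one of the service keys
primary_key, secondary_key = service.get_keys()
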
Code example #3
def dataset_register_file(args):
    """
    Register a file dataset into the workspace
    """
    workspace = package_utils.get_workspace()

    datastore_path, target_path = datastore_upload_files(args)

    logger.info(msg="Dataset.File.from_files",
                extra={"datastore_path": datastore_path})
    if not args.dry_run:
        file_dataset = Dataset.File.from_files(path=datastore_path)

    kwargs = {
        "workspace": workspace,
        "name": target_path,
        "create_new_version": False
    }
    logger.info(msg="file_dataset.register", extra={"kwargs": kwargs})
    if not args.dry_run:
        _ = file_dataset.register(**kwargs)
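
The args object is assumed to be an argparse namespace carrying the fields used above and in datastore_upload_files; the values are illustrative:

import argparse

args = argparse.Namespace(
    dataset_path="datasets",    # local directory containing <dataset_name>/*.csv
    dataset_name="train",
    dataset_version="1",
    dataset_overwrite=False,
    dry_run=False,              # set True to only log the intended calls
)
dataset_register_file(args)
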
Code example #4
def datastore_upload_files(args):
    """
    Get the default datastore and upload files into it
    """
    workspace = package_utils.get_workspace()
    datastore = package_utils.get_default_datastore(workspace)

    directory = pathlib.Path(args.dataset_path, args.dataset_name)
    if not os.path.exists(directory):
        msg = f"The dataset directory {directory} does not exist"
        logger.exception(msg)
        raise RuntimeError(msg)

    files = [
        os.path.abspath(file)
        for file in sorted(glob.glob(f"{directory}/*.csv"))
    ]
    target_path = f"{args.dataset_name}_{args.dataset_version}"
    kwargs = {
        "files": files,
        "target_path": target_path,
        "overwrite": args.dataset_overwrite,
    }
    logger.info(msg="datastore.upload_files", extra={"kwargs": kwargs})
    if not args.dry_run:
        try:
            _ = upload_files(datastore, **kwargs)
        except Exception as error:
            msg = f"Upload to target_path {target_path} failed"
            logger.exception(msg)
            raise RuntimeError(msg) from error

    datastore_path = [
        DataPath(datastore,
                 str(pathlib.Path(target_path, os.path.basename(file))))
        for file in files
    ]

    return datastore_path, target_path
Code example #5
def dataset_register_tabular(args):
    """
    Register a tabular dataset into the workspace
    """
    workspace = package_utils.get_workspace()

    datastore_path, target_path = datastore_upload_files(args)

    kwargs = {"path": datastore_path, "set_column_types": DATA_TYPES}
    logger.info(msg="TabularDatasetFactory.from_delimited_files",
                extra={"kwargs": kwargs})
    if not args.dry_run:
        tabular = TabularDatasetFactory.from_delimited_files(**kwargs)

    kwargs = {
        "workspace": workspace,
        "name": target_path,
        "create_new_version": False
    }
    logger.info(msg="tabular.register", extra={"kwargs": kwargs})
    if not args.dry_run:
        _ = tabular.register(**kwargs)
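
Reusing the argparse namespace sketched under code example #3, the registration could be verified afterwards; the dataset name follows the <dataset_name>_<dataset_version> convention used by datastore_upload_files:

dataset_register_tabular(args)
# Load the registered dataset back to confirm the schema
tabular = Dataset.get_by_name(workspace=package_utils.get_workspace(),
                              name=f"{args.dataset_name}_{args.dataset_version}")
print(tabular.to_pandas_dataframe().dtypes)
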
Code example #6
def main(args):
    """
    Get an existing AmlCompute cluster or provision a new one
    """

    workspace = package_utils.get_workspace()
    if args.cluster_name is None:
        cluster_name = package_utils.trim_cluster_name(workspace.name)
    else:
        cluster_name = args.cluster_name

    try:
        compute_target = AmlCompute(workspace=workspace, name=cluster_name)
        logger.info(msg=f"Found existing cluster {cluster_name}")
    except ComputeTargetException:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size=args.cluster_sku,
            vm_priority=args.cluster_vm_priority,
            idle_seconds_before_scaledown=40 * 60,
            min_nodes=args.cluster_min_nodes,
            max_nodes=args.cluster_max_nodes,
        )
        compute_target_create = ComputeTarget.create(workspace, cluster_name,
                                                     compute_config)

        compute_target_create.wait_for_completion(show_output=True,
                                                  min_node_count=None,
                                                  timeout_in_minutes=5)
        logger.info(
            msg="main",
            extra={"status": compute_target_create.get_status().serialize()})

    compute_target = workspace.compute_targets[cluster_name]
    logger.info(msg="main",
                extra={"compute_target": compute_target.serialize()})

    return compute_target
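
A sketch of driving the helper directly; parse_args is assumed to provide defaults for the remaining cluster_* arguments:

args = parse_args()
args.cluster_sku = "Standard_D2s_v3"
args.cluster_max_nodes = 4
compute_target = main(args)
print(compute_target.get_status().serialize())
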
Code example #7
def main(
    workspace=None,
    dataset_trainandvalidate_name=config.get_default_dataset_name(
        "trainandvalidate"),
):
    """
    Return AutoMLConfig
    """

    if not workspace:
        workspace = package_utils.get_workspace()

    args = aml_compute.parse_args()
    cluster_max_nodes = 5
    args.cluster_max_nodes = cluster_max_nodes
    args.cluster_sku = "Standard_D12_v2"
    compute_target = aml_compute.main(args)
    logger.info(msg="main",
                extra={"compute_target": compute_target.serialize()})

    trainandvalidate = Dataset.get_by_name(
        workspace=workspace,
        name=dataset_trainandvalidate_name,
    )

    model_settings = {
        "task": "classification",
        "primary_metric": "norm_macro_recall",
    }

    ensemble_settings = {
        "iterations": 15,
        "allowed_models": ["LightGBM", "LogisticRegression", "SGD",
                           "XGBoostClassifier"],
        "enable_voting_ensemble": True,
        "enable_stack_ensemble": False,
    }

    dataset_settings = {
        "validation_size": 0.3,
        "featurization": "auto",
        "training_data": trainandvalidate,
        "label_column_name": "Label",
    }

    compute_settings = {
        "compute_target": compute_target,
        "max_cores_per_iteration": -1,
        "max_concurrent_iterations": cluster_max_nodes,
        "experiment_timeout_hours": 1.5,
    }

    automl_config = AutoMLConfig(
        **model_settings,
        **ensemble_settings,
        **dataset_settings,
        **compute_settings,
    )

    return automl_config
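
A sketch of submitting the returned config with the standard Experiment API; the experiment name "automl" is illustrative:

from azureml.core import Experiment

automl_config = main()
experiment = Experiment(workspace=package_utils.get_workspace(), name="automl")
remote_run = experiment.submit(automl_config, show_output=True)
# Best child run and its fitted model, available once the run completes
best_run, fitted_model = remote_run.get_output()
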
Code example #8
def main(
        workspace=None,
        dataset_train_name=config.get_default_dataset_name("train"),
        dataset_validate_name=config.get_default_dataset_name("validate"),
):
    """
    Return HyperDriveConfig
    """
    if not workspace:
        workspace = package_utils.get_workspace()

    cluster_max_nodes = 4
    args = aml_compute.parse_args()
    args.cluster_max_nodes = cluster_max_nodes
    args.cluster_sku = "Standard_D2s_v3"
    args.cluster_vm_priority = "dedicated"
    compute_target = aml_compute.main(args)
    logger.info(msg="main",
                extra={"compute_target": compute_target.serialize()})

    environment = get_environment()
    logger.info(msg="main", extra={"environment": environment})

    run_config = RunConfiguration()
    run_config.target = compute_target
    run_config.environment = environment
    logger.info(msg="main", extra={"run_config": run_config})

    parameter_space = {
        "--hyperparameter-n_estimators": choice(range(15, 20 + 1, 5)),
        "--hyperparameter-criterion": choice(["gini", "entropy"]),
        "--hyperparameter-max_depth": choice(range(10, 15 + 1, 5)),
    }
    hyperparameter_sampling = GridParameterSampling(parameter_space)
    # Total number of grid runs = product of the number of options in each
    # choice expression of the parameter space
    hyperparameter_sampling_number_of_runs = functools.reduce(
        operator.mul, [len(value[1][0]) for value in parameter_space.values()])

    train = Dataset.get_by_name(
        workspace=workspace,
        name=dataset_train_name,
    )
    validate = Dataset.get_by_name(
        workspace=workspace,
        name=dataset_validate_name,
    )

    arguments = [
        "--dataset-train-path",
        train.as_named_input("train").as_mount(),
        "--dataset-validate-path",
        validate.as_named_input("validate").as_mount(),
        "--hyperparameter-n_jobs",
        -1,
        "--hyperparameter-random_state",
        0,
    ]

    script_run_config = ScriptRunConfig(
        source_directory="nd00333/model/hyperdrive/train",
        script="train.py",
        arguments=arguments,
        run_config=run_config,
        compute_target=compute_target,
        environment=environment,
        max_run_duration_seconds=60 * 10,
    )

    # GridParameterSampling is not an iterative process, so it does not
    # benefit from an early-termination policy. On the contrary, a highly
    # accurate model may be sampled right after an inaccurate one.
    # A termination policy that effectively never terminates any run is
    # therefore used.
    policy = BanditPolicy(evaluation_interval=1,
                          slack_factor=None,
                          slack_amount=1.0,
                          delay_evaluation=0)

    hd_config = HyperDriveConfig(
        hyperparameter_sampling=hyperparameter_sampling,
        primary_metric_name="norm_macro_recall",
        primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
        max_total_runs=hyperparameter_sampling_number_of_runs,
        max_concurrent_runs=cluster_max_nodes,
        policy=policy,
        run_config=script_run_config,
    )

    return hd_config
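
A sketch of submitting the HyperDrive config and selecting the best run; the experiment name "hyperdrive" is illustrative:

from azureml.core import Experiment

hd_config = main()
experiment = Experiment(workspace=package_utils.get_workspace(), name="hyperdrive")
hd_run = experiment.submit(hd_config)
hd_run.wait_for_completion(show_output=True)
# The child run that maximized norm_macro_recall
best_run = hd_run.get_best_run_by_primary_metric()
print(best_run.get_metrics())
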