Code example #1
0
def _get_configs(automlconfig: AutoMLConfig,
                 data: Dataset,
                 target_column: str,
                 compute_target: ComputeTarget,
                 group_column_names: List[str]) -> Dict[str, AutoMLConfig]:
    """Build one AutoMLConfig per distinct group in *data*.

    Each group (a unique combination of values in ``group_column_names``)
    gets a deep copy of *automlconfig* whose training data is the subset of
    rows belonging to that group.

    :param automlconfig: Template config; copied, never mutated.
    :param data: The dataset to split per group.
    :param target_column: Label column set on every per-group config.
    :param compute_target: Compute target set on every per-group config.
    :param group_column_names: Columns whose value combinations define groups.
    :return: Mapping of sanitized group name to its AutoMLConfig.
    :raises RuntimeError: If more than 40 groups are present.
    """
    # Matches every character that must be stripped from generated names.
    invalid_name_chars = re.compile('[^a-zA-Z0-9-]')
    groups = _get_groups(data, group_column_names)
    if groups.shape[0] > 40:
        raise RuntimeError("AutoML only supports 40 or less groups. Please modify your "
                           "group_column_names to ensure no more than 40 groups are present.")
    configs: Dict[str, AutoMLConfig] = {}
    for _, group_row in groups.iterrows():
        # Narrow the dataflow to the rows that belong to this group.
        filtered = data._dataflow
        for column in group_row.index:
            filtered = filtered.filter(data._dataflow[column] == group_row[column])
        # Join the group's values, then drop any invalid characters.
        raw_name = "#####".join(str(value) for value in group_row.values)
        safe_name = invalid_name_chars.sub('', raw_name)
        per_group_config = copy.deepcopy(automlconfig)
        settings = per_group_config.user_settings
        settings['training_data'] = TabularDataset._create(filtered)
        settings['label_column_name'] = target_column
        settings['compute_target'] = compute_target
        configs[safe_name] = per_group_config
    return configs
Code example #2
0
    def _register_output(self, run_name, dest, config):
        """Turn the files at *dest* into a dataset, optionally apply a saved
        transformation, register or save it in the workspace, and record the
        dataset as output lineage of the current run.

        :param run_name: Name of the producing run, recorded in the lineage.
        :param dest: Location of the output files, forwarded to
            ``Dataset.File.from_files`` (with validation disabled).
        :param config: Output configuration dict; reads the
            ``AdditionalOptions`` and ``Mechanism`` keys.
        """
        from azureml.core import Dataset

        def save_lineage(dataset, mode):
            # Best-effort lineage update: any failure is logged and swallowed
            # so output registration itself never fails on lineage errors.
            from azureml._restclient.models import OutputDatasetLineage, DatasetIdentifier, DatasetOutputType, \
                DatasetOutputDetails, DatasetOutputMechanism
            from azureml.core import Run
            from azureml.data.constants import MOUNT_MODE

            id = dataset.id  # NOTE(review): shadows the builtin `id`
            # `_registration` is falsy for unregistered datasets, so
            # registered_id ends up None in that case.
            registered_id = dataset._registration and dataset._registration.registered_id
            version = dataset.version
            dataset_id = DatasetIdentifier(id, registered_id, version)
            # Mechanism string other than mount is treated as upload.
            output_details = DatasetOutputDetails(
                run_name, DatasetOutputMechanism.mount if mode.lower()
                == MOUNT_MODE else DatasetOutputMechanism.upload)
            output_lineage = OutputDatasetLineage(dataset_id,
                                                  DatasetOutputType.run_output,
                                                  output_details)

            try:
                run = Run.get_context()
                run._update_output_dataset_lineage([output_lineage])
            except Exception:
                module_logger.error("Failed to update output dataset lineage")

        # All registration metadata is optional; missing sections fall back
        # to empty dicts and missing fields to None.
        additional_options = config["AdditionalOptions"]
        registration_options = additional_options.get(
            "RegistrationOptions") or {}
        name = registration_options.get("Name")
        description = registration_options.get("Description")
        tags = registration_options.get("Tags")
        dataset_registration = registration_options.get(
            "DatasetRegistrationOptions") or {}
        # Optional serialized dataflow (JSON) to append to the new dataset.
        dataflow = dataset_registration.get("AdditionalTransformation")

        dataset = Dataset.File.from_files(dest, False)
        if dataflow:
            import azureml.dataprep as dprep
            from azureml.data import TabularDataset, FileDataset
            from azureml.data._dataprep_helper import is_tabular

            # Append the saved transformation's steps after the dataset's own
            # steps; the combined dataflow becomes tabular or file-based
            # depending on what the transformation produces.
            transformations = dprep.Dataflow.from_json(dataflow)
            combined = dprep.Dataflow(
                transformations._engine_api,
                dataset._dataflow._get_steps() + transformations._get_steps())
            dataset = TabularDataset._create(combined) if is_tabular(transformations)\
                else FileDataset._create(combined)
        if name:
            # NOTE(review): the trailing True is presumably
            # create_new_version (re-registering a name adds a version
            # rather than failing) — confirm against Dataset._register.
            dataset = dataset._register(self._workspace, name, description,
                                        tags, True)
        else:
            # No registration name: persist the dataset definition without
            # registering it under a name.
            dataset._ensure_saved_internal(self._workspace)

        save_lineage(dataset, config["Mechanism"])
Code example #3
0
def _get_configs(automlconfig: AutoMLConfig, data: Dataset, target_column: str,
                 compute_target: ComputeTarget,
                 group_column_names: List[str]) -> Dict[str, AutoMLConfig]:
    """Build one AutoMLConfig per distinct group in *data*.

    Each group (a unique combination of values in ``group_column_names``)
    gets a deep copy of *automlconfig* whose training data is restricted to
    that group's rows.

    :param automlconfig: Template config; copied, never mutated.
    :param data: The dataset to split per group.
    :param target_column: Label column set on every per-group config.
    :param compute_target: Compute target set on every per-group config.
    :param group_column_names: Columns whose value combinations define groups.
    :return: Mapping of sanitized group name to its AutoMLConfig.
    :raises RuntimeError: If more than 40 groups are present.
    """
    # remove invalid characters regex
    valid_chars = re.compile('[^a-zA-Z0-9-]')
    groups = _get_groups(data, group_column_names)
    # Guard against exceeding the AutoML group limit up front.
    if groups.shape[0] > 40:
        raise RuntimeError("AutoML only supports 40 or less groups. Please modify your "
                           "group_column_names to ensure no more than 40 groups are present.")
    configs = {}
    for i, group in groups.iterrows():
        # BUG FIX: start from the Dataflow and keep filtering the Dataflow.
        # The original assigned `single = data` (a Dataset) and then rebound
        # it to `single._dataflow.filter(...)` (a Dataflow), so the second
        # group column raised AttributeError: Dataflow has no `_dataflow`.
        single = data._dataflow
        group_name = "#####".join(str(x) for x in group.values)
        group_name = valid_chars.sub('', group_name)
        for key in group.index:
            single = single.filter(data._dataflow[key] == group[key])
        t_dataset = TabularDataset._create(single)
        group_conf = copy.deepcopy(automlconfig)
        group_conf.user_settings['training_data'] = t_dataset
        group_conf.user_settings['label_column_name'] = target_column
        group_conf.user_settings['compute_target'] = compute_target
        configs[group_name] = group_conf
    return configs
Code example #4
0
def get_backtest_pipeline(
    experiment: Experiment,
    dataset: TabularDataset,
    process_per_node: int,
    node_count: int,
    compute_target: ComputeTarget,
    automl_settings: Dict[str, Any],
    step_size: int,
    step_number: int,
    model_name: Optional[str] = None,
    model_uid: Optional[str] = None,
) -> Pipeline:
    """
    Build a three-step backtesting pipeline: split the data, retrain/backtest
    in parallel, then collect the scores.

    :param experiment: The experiment used to run the pipeline.
    :param dataset: Tabular data set to be used for model training.
    :param process_per_node: The number of processes per node. Generally it should be the number of cores
                             on the node divided by two.
    :param node_count: The number of nodes to be used.
    :param compute_target: The compute target to be used to run the pipeline.
    :param model_name: The name of a model to be back tested.
    :param automl_settings: The dictionary with automl settings.
    :param step_size: The number of periods to step back in backtesting.
    :param step_number: The number of backtesting iterations.
    :param model_uid: The uid to mark models from this run of the experiment.
    :return: The pipeline to be used for model retraining.
             **Note:** The output will be uploaded in the pipeline output
             called 'score'.
    """
    # Fetch the AutoML curated environment (CPU-only, no DNN) matching the
    # compute target's VM SKU; reused by every step below.
    jasmine_client = JasmineClient(
        service_context=experiment.workspace.service_context,
        experiment_name=experiment.name,
        experiment_id=experiment.id,
    )
    env = jasmine_client.get_curated_environment(
        scenario=Scenarios.AUTOML,
        enable_dnn=False,
        enable_gpu=False,
        compute=compute_target,
        compute_sku=experiment.workspace.compute_targets.get(
            compute_target.name
        ).vm_size,
    )
    # Final pipeline output holding the collected scores.
    data_results = PipelineData(
        name="results", datastore=None, pipeline_output_name="results"
    )
    ############################################################
    # Split the data set using python script.
    ############################################################
    run_config = RunConfiguration()
    run_config.docker.use_docker = True
    run_config.environment = env

    utilities.set_environment_variables_for_run(run_config)

    split_data = PipelineData(name="split_data_output", datastore=None).as_dataset()
    split_step = PythonScriptStep(
        name="split_data_for_backtest",
        script_name="data_split.py",
        inputs=[dataset.as_named_input("training_data")],
        outputs=[split_data],
        source_directory=PROJECT_FOLDER,
        arguments=[
            "--step-size",
            step_size,
            "--step-number",
            step_number,
            "--time-column-name",
            automl_settings.get("time_column_name"),
            "--time-series-id-column-names",
            automl_settings.get("grain_column_names"),
            "--output-dir",
            split_data,
        ],
        runconfig=run_config,
        # allow_reuse=False everywhere: backtest runs must not be served
        # from cached step results.
        compute_target=compute_target,
        allow_reuse=False,
    )
    ############################################################
    # We will do the backtest the parallel run step.
    ############################################################
    # Persist the AutoML settings next to the scripts so retrain_models.py
    # can load them at run time.
    settings_path = os.path.join(PROJECT_FOLDER, SETTINGS_FILE)
    hru.dump_object_to_json(automl_settings, settings_path)
    # Mini-batch size is exposed as a pipeline parameter (default "1").
    mini_batch_size = PipelineParameter(name="batch_size_param", default_value=str(1))
    back_test_config = ParallelRunConfig(
        source_directory=PROJECT_FOLDER,
        entry_script="retrain_models.py",
        mini_batch_size=mini_batch_size,
        # NOTE(review): -1 presumably disables the failure threshold —
        # confirm against the ParallelRunConfig documentation.
        error_threshold=-1,
        output_action="append_row",
        append_row_file_name="outputs.txt",
        compute_target=compute_target,
        environment=env,
        process_count_per_node=process_per_node,
        run_invocation_timeout=3600,
        node_count=node_count,
    )
    utilities.set_environment_variables_for_run(back_test_config)
    forecasts = PipelineData(name="forecasts", datastore=None)
    # Derive a step name from the model name (underscores are not used in
    # step names here); fall back to a generic name for full AutoML runs.
    if model_name:
        parallel_step_name = "{}-backtest".format(model_name.replace("_", "-"))
    else:
        parallel_step_name = "AutoML-backtest"

    prs_args = [
        "--target_column_name",
        automl_settings.get("label_column_name"),
        "--output-dir",
        forecasts,
    ]
    # Model name/uid are optional and only forwarded when provided.
    if model_name is not None:
        prs_args.append("--model-name")
        prs_args.append(model_name)
    if model_uid is not None:
        prs_args.append("--model-uid")
        prs_args.append(model_uid)
    backtest_prs = ParallelRunStep(
        name=parallel_step_name,
        parallel_run_config=back_test_config,
        arguments=prs_args,
        inputs=[split_data],
        output=forecasts,
        allow_reuse=False,
    )
    ############################################################
    # Then we collect the output and return it as scores output.
    ############################################################
    collection_step = PythonScriptStep(
        name="score",
        script_name="score.py",
        inputs=[forecasts.as_mount()],
        outputs=[data_results],
        source_directory=PROJECT_FOLDER,
        arguments=["--forecasts", forecasts, "--output-dir", data_results],
        runconfig=run_config,
        compute_target=compute_target,
        allow_reuse=False,
    )
    # Build and return the pipeline.
    return Pipeline(
        workspace=experiment.workspace,
        steps=[split_step, backtest_prs, collection_step],
    )