Code Example #1
from azureml.core import ComputeTarget
from azureml.pipeline.core import PipelineData, PipelineParameter
from azureml.pipeline.steps import PythonScriptStep

# RC (a RunConfiguration) is assumed to be a module-level global shared by
# the step factories in this module.


def create_register_model_step(model_folder: PipelineData,
                               register_model_folder: PipelineData,
                               compute: ComputeTarget,
                               debug_run: bool) -> PythonScriptStep:
    """
    Creates "Register Model" PythonScriptStep
    """
    force_param = PipelineParameter(name="force_registration",
                                    default_value="False")
    skip_param = PipelineParameter(name="skip_registration",
                                   default_value="False")

    register_step = PythonScriptStep(
        name="Register Model",
        script_name="register_model.py",
        source_directory='./safe-driver/register/',
        compute_target=compute,
        inputs=[model_folder, register_model_folder],
        arguments=[
            "--force", force_param, "--skip", skip_param, "--model-metadata",
            model_folder.as_mount(), "--register-model-folder",
            register_model_folder.as_mount()
        ],
        allow_reuse=debug_run,
        runconfig=RC)

    return register_step
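
The two PipelineParameters declared above ("force_registration" and "skip_registration") can be overridden per run at submission time without rebuilding the pipeline. A minimal sketch, assuming an Experiment named experiment and a Pipeline containing this step already exist:

# Force registration for this run; "skip_registration" keeps its
# default value of "False".
run = experiment.submit(
    pipeline,
    pipeline_parameters={"force_registration": "True"},
)
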
Code Example #2
def create_evaluate_model_step(
        model_metadata_folder: PipelineData, compute: ComputeTarget,
        validation_data: Dataset,
        debug_run: bool) -> Tuple[PythonScriptStep, PipelineData]:
    """
    Creates "Evaluate Model" Step
    """
    output_folder = "./outputs"
    output_data = PipelineData(name="RegisterModel",
                               datastore=WS.get_default_datastore(),
                               is_directory=True,
                               output_path_on_compute=output_folder,
                               output_mode="upload")

    eval_step = PythonScriptStep(
        name="Evaluate Model",
        script_name="evaluate.py",
        source_directory='./safe-driver/evaluate/',
        compute_target=compute,
        inputs=[model_metadata_folder],
        outputs=[output_data],
        arguments=[
            "--model-metadata",
            model_metadata_folder.as_mount(), "--register-model-folder",
            output_folder, "--validation-data",
            validation_data.as_named_input("ValidationData").as_mount()
        ],
        allow_reuse=debug_run,
        runconfig=RC)

    return eval_step, output_data
Code Example #3
def create_train_model_step(
        input_data: PipelineData, compute: ComputeTarget,
        debug_run: bool) -> Tuple[PythonScriptStep, PipelineData]:
    """
    Creates "Train Model" PythonScriptStep
    """
    output_folder = "./outputs"
    output_data = PipelineData(name="ModelMetadata",
                               datastore=WS.get_default_datastore(),
                               is_directory=True,
                               output_path_on_compute=output_folder,
                               output_mode="upload")

    train_step = PythonScriptStep(name="Train Model",
                                  script_name="train.py",
                                  source_directory='./safe-driver/train/',
                                  compute_target=compute,
                                  inputs=[input_data],
                                  outputs=[output_data],
                                  allow_reuse=debug_run,
                                  arguments=[
                                      "--output-folder", output_folder,
                                      "--training-data",
                                      input_data.as_mount()
                                  ],
                                  runconfig=RC)

    return train_step, output_data
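
Taken together, the three factory functions chain into one training pipeline: the train step's "ModelMetadata" output feeds the evaluate step, and the evaluate step's "RegisterModel" output feeds the register step. A minimal sketch of the wiring, assuming WS, RC, a ComputeTarget named compute, a validation Dataset named validation_data, and a PipelineData named training_data (e.g. the output of an upstream data-preparation step) are already defined:

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

train_step, model_metadata = create_train_model_step(
    training_data, compute, debug_run=False)
eval_step, register_folder = create_evaluate_model_step(
    model_metadata, compute, validation_data, debug_run=False)
register_step = create_register_model_step(
    model_metadata, register_folder, compute, debug_run=False)

pipeline = Pipeline(workspace=WS, steps=[train_step, eval_step, register_step])
run = Experiment(WS, "safe-driver-training").submit(pipeline)
run.wait_for_completion(show_output=True)
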
Code Example #4
        "--seed",
        seed_param,
        "--gpus",
        num_gpus_param,
        "--num_workers",
        num_workers_param,
        "--train_batch_size",
        train_batch_size_param,
        "--eval_batch_size",
        eval_batch_size_param,
        "--output_dir",
        "./outputs",
        "--do_train",
        "--do_predict",
    ],
    inputs=[prepared_dataset.as_mount()],
    compute_target=compute_target,
)

step_sequence = StepSequence(steps=[prepare_step, train_step])
pipeline = Pipeline(workspace, steps=step_sequence)

# Submit single experiment run
run = experiment.submit(pipeline)

# Run the three listed models over 5 random seeds (15 experiment runs in total).
# for seed in range(5):
#     for model in ["distilbert-base-cased", "bert-base-cased", "albert-base-v2"]:
#         run = experiment.submit(
#             pipeline,
#             pipeline_parameters={
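
The commented-out sweep can be finished by overriding pipeline parameters on each submit. A minimal sketch, where the parameter names "model_name_or_path" and "seed" are assumptions (substitute the names used when the corresponding PipelineParameter objects were created):

# Hypothetical parameter names below; align them with the
# PipelineParameter definitions used in the training step.
for seed in range(5):
    for model in ["distilbert-base-cased", "bert-base-cased", "albert-base-v2"]:
        run = experiment.submit(
            pipeline,
            pipeline_parameters={
                "model_name_or_path": model,
                "seed": seed,
            },
        )
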
Code Example #5
def get_backtest_pipeline(
    experiment: Experiment,
    dataset: TabularDataset,
    process_per_node: int,
    node_count: int,
    compute_target: ComputeTarget,
    automl_settings: Dict[str, Any],
    step_size: int,
    step_number: int,
    model_name: Optional[str] = None,
    model_uid: Optional[str] = None,
) -> Pipeline:
    """
    :param experiment: The experiment used to run the pipeline.
    :param dataset: Tabular data set to be used for model training.
    :param process_per_node: The number of processes per node. Generally it should be the number of cores
                             on the node divided by two.
    :param node_count: The number of nodes to be used.
    :param compute_target: The compute target to be used to run the pipeline.
    :param model_name: The name of a model to be back tested.
    :param automl_settings: The dictionary with automl settings.
    :param step_size: The number of periods to step back in backtesting.
    :param step_number: The number of backtesting iterations.
    :param model_uid: The uid to mark models from this run of the experiment.
    :return: The pipeline to be used for model retraining.
             **Note:** The output will be uploaded in the pipeline output
             called 'score'.
    """
    jasmine_client = JasmineClient(
        service_context=experiment.workspace.service_context,
        experiment_name=experiment.name,
        experiment_id=experiment.id,
    )
    env = jasmine_client.get_curated_environment(
        scenario=Scenarios.AUTOML,
        enable_dnn=False,
        enable_gpu=False,
        compute=compute_target,
        compute_sku=experiment.workspace.compute_targets.get(
            compute_target.name
        ).vm_size,
    )
    data_results = PipelineData(
        name="results", datastore=None, pipeline_output_name="results"
    )
    ############################################################
    # Split the data set using python script.
    ############################################################
    run_config = RunConfiguration()
    run_config.docker.use_docker = True
    run_config.environment = env

    utilities.set_environment_variables_for_run(run_config)

    split_data = PipelineData(name="split_data_output", datastore=None).as_dataset()
    split_step = PythonScriptStep(
        name="split_data_for_backtest",
        script_name="data_split.py",
        inputs=[dataset.as_named_input("training_data")],
        outputs=[split_data],
        source_directory=PROJECT_FOLDER,
        arguments=[
            "--step-size",
            step_size,
            "--step-number",
            step_number,
            "--time-column-name",
            automl_settings.get("time_column_name"),
            "--time-series-id-column-names",
            automl_settings.get("grain_column_names"),
            "--output-dir",
            split_data,
        ],
        runconfig=run_config,
        compute_target=compute_target,
        allow_reuse=False,
    )
    ############################################################
    # We will run the backtest using a ParallelRunStep.
    ############################################################
    settings_path = os.path.join(PROJECT_FOLDER, SETTINGS_FILE)
    hru.dump_object_to_json(automl_settings, settings_path)
    mini_batch_size = PipelineParameter(name="batch_size_param", default_value=str(1))
    back_test_config = ParallelRunConfig(
        source_directory=PROJECT_FOLDER,
        entry_script="retrain_models.py",
        mini_batch_size=mini_batch_size,
        error_threshold=-1,
        output_action="append_row",
        append_row_file_name="outputs.txt",
        compute_target=compute_target,
        environment=env,
        process_count_per_node=process_per_node,
        run_invocation_timeout=3600,
        node_count=node_count,
    )
    utilities.set_environment_variables_for_run(back_test_config)
    forecasts = PipelineData(name="forecasts", datastore=None)
    if model_name:
        parallel_step_name = "{}-backtest".format(model_name.replace("_", "-"))
    else:
        parallel_step_name = "AutoML-backtest"

    prs_args = [
        "--target_column_name",
        automl_settings.get("label_column_name"),
        "--output-dir",
        forecasts,
    ]
    if model_name is not None:
        prs_args.append("--model-name")
        prs_args.append(model_name)
    if model_uid is not None:
        prs_args.append("--model-uid")
        prs_args.append(model_uid)
    backtest_prs = ParallelRunStep(
        name=parallel_step_name,
        parallel_run_config=back_test_config,
        arguments=prs_args,
        inputs=[split_data],
        output=forecasts,
        allow_reuse=False,
    )
    ############################################################
    # Then we collect the output and return it as the scores output.
    ############################################################
    collection_step = PythonScriptStep(
        name="score",
        script_name="score.py",
        inputs=[forecasts.as_mount()],
        outputs=[data_results],
        source_directory=PROJECT_FOLDER,
        arguments=["--forecasts", forecasts, "--output-dir", data_results],
        runconfig=run_config,
        compute_target=compute_target,
        allow_reuse=False,
    )
    # Build and return the pipeline.
    return Pipeline(
        workspace=experiment.workspace,
        steps=[split_step, backtest_prs, collection_step],
    )
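
Submitting the returned pipeline and collecting the aggregated backtest results could look like the following; a minimal sketch, assuming experiment, tabular_dataset, compute_target, and automl_settings are already configured, and with illustrative values for step_size and step_number:

pipeline = get_backtest_pipeline(
    experiment=experiment,
    dataset=tabular_dataset,
    process_per_node=2,
    node_count=2,
    compute_target=compute_target,
    automl_settings=automl_settings,
    step_size=30,    # move the training cut-off back 30 periods per iteration
    step_number=4,   # run 4 backtest iterations
    model_uid="backtest-demo",
)
pipeline_run = experiment.submit(pipeline)
pipeline_run.wait_for_completion(show_output=False)

# The collection step publishes its results under the pipeline output
# named "results"; download them locally for inspection.
pipeline_run.get_pipeline_output("results").download("backtest_results")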