def _get_configs(automlconfig: AutoMLConfig,
                 data: Dataset,
                 target_column: str,
                 compute_target: ComputeTarget,
                 group_column_names: List[str]) -> Dict[str, AutoMLConfig]:
    # Regex used to strip invalid characters from group names.
    valid_chars = re.compile('[^a-zA-Z0-9-]')
    groups = _get_groups(data, group_column_names)
    if groups.shape[0] > 40:
        raise RuntimeError("AutoML only supports 40 or fewer groups. Please modify your "
                           "group_column_names to ensure no more than 40 groups are present.")
    configs = {}
    for _, group in groups.iterrows():
        # Filter the dataflow down to the rows belonging to this group.
        single = data._dataflow
        group_name = "#####".join(str(x) for x in group.values)
        group_name = valid_chars.sub('', group_name)
        for key in group.index:
            single = single.filter(data._dataflow[key] == group[key])
        t_dataset = TabularDataset._create(single)
        # Clone the base config and point it at the per-group dataset.
        group_conf = copy.deepcopy(automlconfig)
        group_conf.user_settings['training_data'] = t_dataset
        group_conf.user_settings['label_column_name'] = target_column
        group_conf.user_settings['compute_target'] = compute_target
        configs[group_name] = group_conf
    return configs
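
# Usage sketch (not part of the original module): one way the per-group
# configs might be consumed is to submit each one as its own AutoML run, so
# a failure in one group does not abort the others. The helper name and the
# `experiment` argument are illustrative assumptions, not existing API.
def _submit_group_runs(experiment: Experiment,
                       configs: Dict[str, AutoMLConfig]) -> Dict[str, Any]:
    runs = {}
    for group_name, conf in configs.items():
        # Experiment.submit accepts an AutoMLConfig and returns the parent run.
        runs[group_name] = experiment.submit(conf, show_output=False)
    return runs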
def _register_output(self, run_name, dest, config):
    from azureml.core import Dataset

    def save_lineage(dataset, mode):
        from azureml._restclient.models import OutputDatasetLineage, DatasetIdentifier, DatasetOutputType, \
            DatasetOutputDetails, DatasetOutputMechanism
        from azureml.core import Run
        from azureml.data.constants import MOUNT_MODE

        id = dataset.id
        registered_id = dataset._registration and dataset._registration.registered_id
        version = dataset.version
        dataset_id = DatasetIdentifier(id, registered_id, version)
        output_details = DatasetOutputDetails(
            run_name,
            DatasetOutputMechanism.mount if mode.lower() == MOUNT_MODE else DatasetOutputMechanism.upload)
        output_lineage = OutputDatasetLineage(dataset_id, DatasetOutputType.run_output, output_details)
        try:
            run = Run.get_context()
            run._update_output_dataset_lineage([output_lineage])
        except Exception:
            module_logger.error("Failed to update output dataset lineage")

    additional_options = config["AdditionalOptions"]
    registration_options = additional_options.get("RegistrationOptions") or {}
    name = registration_options.get("Name")
    description = registration_options.get("Description")
    tags = registration_options.get("Tags")
    dataset_registration = registration_options.get("DatasetRegistrationOptions") or {}
    dataflow = dataset_registration.get("AdditionalTransformation")

    dataset = Dataset.File.from_files(dest, False)
    if dataflow:
        # Append the user-supplied transformation steps to the dataset's own
        # dataflow steps before registration.
        import azureml.dataprep as dprep
        from azureml.data import TabularDataset, FileDataset
        from azureml.data._dataprep_helper import is_tabular

        transformations = dprep.Dataflow.from_json(dataflow)
        combined = dprep.Dataflow(
            transformations._engine_api,
            dataset._dataflow._get_steps() + transformations._get_steps())
        dataset = TabularDataset._create(combined) if is_tabular(transformations) \
            else FileDataset._create(combined)
    if name:
        dataset = dataset._register(self._workspace, name, description, tags, True)
    else:
        dataset._ensure_saved_internal(self._workspace)
    save_lineage(dataset, config["Mechanism"])
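
# For reference: the shape of the `config` dictionary that _register_output
# reads, inferred from the keys accessed above. The concrete values are
# illustrative placeholders, not documented defaults.
_EXAMPLE_OUTPUT_CONFIG = {
    # Compared (lower-cased) against azureml.data.constants.MOUNT_MODE to
    # pick between the mount and upload lineage mechanisms.
    "Mechanism": "mount",
    "AdditionalOptions": {
        "RegistrationOptions": {
            "Name": "my-output-dataset",  # if set, the dataset is registered
            "Description": "Pipeline output dataset",
            "Tags": {"stage": "output"},
            "DatasetRegistrationOptions": {
                # Optional dataprep Dataflow serialized as JSON; its steps are
                # appended to the dataset's steps before registration.
                "AdditionalTransformation": None,
            },
        },
    },
}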
def get_backtest_pipeline(
    experiment: Experiment,
    dataset: TabularDataset,
    process_per_node: int,
    node_count: int,
    compute_target: ComputeTarget,
    automl_settings: Dict[str, Any],
    step_size: int,
    step_number: int,
    model_name: Optional[str] = None,
    model_uid: Optional[str] = None,
) -> Pipeline:
    """
    Build the pipeline used for backtesting.

    :param experiment: The experiment used to run the pipeline.
    :param dataset: Tabular dataset to be used for model training.
    :param process_per_node: The number of processes per node. Generally it should be the
                             number of cores on the node divided by two.
    :param node_count: The number of nodes to be used.
    :param compute_target: The compute target to be used to run the pipeline.
    :param automl_settings: The dictionary with AutoML settings.
    :param step_size: The number of periods to step back in backtesting.
    :param step_number: The number of backtesting iterations.
    :param model_name: The name of a model to be backtested.
    :param model_uid: The uid to mark models from this run of the experiment.
    :return: The pipeline to be used for model retraining.
             **Note:** The output will be uploaded in the pipeline output called 'results'.
    """
    jasmine_client = JasmineClient(
        service_context=experiment.workspace.service_context,
        experiment_name=experiment.name,
        experiment_id=experiment.id,
    )
    env = jasmine_client.get_curated_environment(
        scenario=Scenarios.AUTOML,
        enable_dnn=False,
        enable_gpu=False,
        compute=compute_target,
        compute_sku=experiment.workspace.compute_targets.get(
            compute_target.name
        ).vm_size,
    )
    data_results = PipelineData(
        name="results", datastore=None, pipeline_output_name="results"
    )
    ############################################################
    # Split the data set using a python script step.
    ############################################################
    run_config = RunConfiguration()
    run_config.docker.use_docker = True
    run_config.environment = env
    utilities.set_environment_variables_for_run(run_config)
    split_data = PipelineData(name="split_data_output", datastore=None).as_dataset()
    split_step = PythonScriptStep(
        name="split_data_for_backtest",
        script_name="data_split.py",
        inputs=[dataset.as_named_input("training_data")],
        outputs=[split_data],
        source_directory=PROJECT_FOLDER,
        arguments=[
            "--step-size", step_size,
            "--step-number", step_number,
            "--time-column-name", automl_settings.get("time_column_name"),
            "--time-series-id-column-names", automl_settings.get("grain_column_names"),
            "--output-dir", split_data,
        ],
        runconfig=run_config,
        compute_target=compute_target,
        allow_reuse=False,
    )
    ############################################################
    # We will run the backtest in a parallel run step.
    ############################################################
    settings_path = os.path.join(PROJECT_FOLDER, SETTINGS_FILE)
    hru.dump_object_to_json(automl_settings, settings_path)
    mini_batch_size = PipelineParameter(name="batch_size_param", default_value=str(1))
    back_test_config = ParallelRunConfig(
        source_directory=PROJECT_FOLDER,
        entry_script="retrain_models.py",
        mini_batch_size=mini_batch_size,
        error_threshold=-1,
        output_action="append_row",
        append_row_file_name="outputs.txt",
        compute_target=compute_target,
        environment=env,
        process_count_per_node=process_per_node,
        run_invocation_timeout=3600,
        node_count=node_count,
    )
    utilities.set_environment_variables_for_run(back_test_config)
    forecasts = PipelineData(name="forecasts", datastore=None)
    if model_name:
        parallel_step_name = "{}-backtest".format(model_name.replace("_", "-"))
    else:
        parallel_step_name = "AutoML-backtest"

    prs_args = [
        "--target_column_name", automl_settings.get("label_column_name"),
        "--output-dir", forecasts,
    ]
    if model_name is not None:
        prs_args.append("--model-name")
        prs_args.append(model_name)
    if model_uid is not None:
        prs_args.append("--model-uid")
        prs_args.append(model_uid)
    backtest_prs = ParallelRunStep(
        name=parallel_step_name,
        parallel_run_config=back_test_config,
        arguments=prs_args,
        inputs=[split_data],
        output=forecasts,
        allow_reuse=False,
    )
    ############################################################
    # Then we collect the output and return it as the scores output.
    ############################################################
    collection_step = PythonScriptStep(
        name="score",
        script_name="score.py",
        inputs=[forecasts.as_mount()],
        outputs=[data_results],
        source_directory=PROJECT_FOLDER,
        arguments=["--forecasts", forecasts, "--output-dir", data_results],
        runconfig=run_config,
        compute_target=compute_target,
        allow_reuse=False,
    )
    # Build and return the pipeline.
    return Pipeline(
        workspace=experiment.workspace,
        steps=[split_step, backtest_prs, collection_step],
    )
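
# Usage sketch (illustrative, not part of the original module): build the
# backtest pipeline, submit it, and download the collected forecasts from
# the 'results' pipeline output defined above. The experiment, dataset,
# compute target, and settings objects are assumed to exist in the caller's
# context; the step values below are placeholders.
def _example_run_backtest(experiment: Experiment,
                          train_dataset: TabularDataset,
                          compute_target: ComputeTarget,
                          automl_settings: Dict[str, Any]) -> None:
    pipeline = get_backtest_pipeline(
        experiment=experiment,
        dataset=train_dataset,
        process_per_node=2,
        node_count=2,
        compute_target=compute_target,
        automl_settings=automl_settings,
        step_size=10,   # periods to step back per iteration
        step_number=4,  # number of backtest iterations
    )
    run = experiment.submit(pipeline)
    run.wait_for_completion(show_output=False)
    # get_pipeline_output returns a PortDataReference for the named output.
    run.get_pipeline_output("results").download("backtest_results")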