def create_register_model_step(model_folder: PipelineData,
                               register_model_folder: PipelineData,
                               compute: ComputeTarget,
                               debug_run: bool) -> PythonScriptStep:
    """ Creates "Register Model" PythonScriptStep """
    force_param = PipelineParameter(name="force_registration", default_value="False")
    skip_param = PipelineParameter(name="skip_registration", default_value="False")

    register_step = PythonScriptStep(
        name="Register Model",
        script_name="register_model.py",
        source_directory='./safe-driver/register/',
        compute_target=compute,
        inputs=[model_folder, register_model_folder],
        arguments=[
            "--force", force_param,
            "--skip", skip_param,
            "--model-metadata", model_folder.as_mount(),
            "--register-model-folder", register_model_folder.as_mount()
        ],
        allow_reuse=debug_run,
        runconfig=RC)
    return register_step

def create_train_model_step(
        input_data: PipelineData,
        compute: ComputeTarget,
        debug_run: bool) -> Tuple[PythonScriptStep, PipelineData]:
    """ Creates "Train Model" PythonScriptStep """
    output_folder = "./outputs"
    output_data = PipelineData(name="ModelMetadata",
                               datastore=WS.get_default_datastore(),
                               is_directory=True,
                               output_path_on_compute=output_folder,
                               output_mode="upload")

    train_step = PythonScriptStep(name="Train Model",
                                  script_name="train.py",
                                  source_directory='./safe-driver/train/',
                                  compute_target=compute,
                                  inputs=[input_data],
                                  outputs=[output_data],
                                  allow_reuse=debug_run,
                                  arguments=[
                                      "--output-folder", output_folder,
                                      "--training-data", input_data.as_mount()
                                  ],
                                  runconfig=RC)
    return train_step, output_data

def create_evaluate_model_step(
        model_metadata_folder: PipelineData,
        compute: ComputeTarget,
        validation_data: Dataset,
        debug_run: bool) -> Tuple[PythonScriptStep, PipelineData]:
    """ Creates "Evaluate Model" Step """
    output_folder = "./outputs"
    output_data = PipelineData(name="RegisterModel",
                               datastore=WS.get_default_datastore(),
                               is_directory=True,
                               output_path_on_compute=output_folder,
                               output_mode="upload")

    eval_step = PythonScriptStep(
        name="Evaluate Model",
        script_name="evaluate.py",
        source_directory='./safe-driver/evaluate/',
        compute_target=compute,
        inputs=[model_metadata_folder],
        outputs=[output_data],
        arguments=[
            "--model-metadata", model_metadata_folder.as_mount(),
            "--register-model-folder", output_folder,
            "--validation-data", validation_data.as_named_input("ValidationData").as_mount()
        ],
        allow_reuse=debug_run,
        runconfig=RC)
    return eval_step, output_data

def get_output_location(
    ws: Workspace,
    env: Env,
    outputdatastore: Datastore = None
) -> PipelineData:
    """
    Returns a Datastore wrapped as a PipelineData instance suitable for
    passing into a pipeline step. Represents the location where the scoring
    output should be written. Uses the default workspace blob store if no
    output datastore is supplied.

    :param ws: AML Workspace
    :param env: Environment Variables
    :param outputdatastore: AML Datastore, optional, default is None
    :returns: PipelineData wrapping the output datastore
    """
    if outputdatastore is None:
        output_loc = PipelineData(
            name="defaultoutput",
            datastore=ws.get_default_datastore()
        )
    else:
        output_loc = PipelineData(
            name=outputdatastore.name,
            datastore=outputdatastore
        )
    return output_loc

def get_output_location(workspace, output_datastore=None):
    """Return a PipelineData output location, falling back to the workspace default datastore."""
    if output_datastore is not None:
        output_loc = PipelineData(name=output_datastore.name,
                                  datastore=output_datastore)
    else:
        output_loc = PipelineData(name='output_loc',
                                  datastore=workspace.get_default_datastore())
    return output_loc

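# A minimal usage sketch for get_output_location(): the returned PipelineData is
# handed to a step as an output so the step's script can write results into it.
# `ws`, `compute_target`, and score.py are assumed/hypothetical names, not from the source.
output_loc = get_output_location(ws)
score_step = PythonScriptStep(
    name="Batch Score",
    script_name="score.py",
    arguments=["--output_dir", output_loc],
    outputs=[output_loc],
    compute_target=compute_target,
    source_directory=".")
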
def data_ingestion_step(datastore_reference, compute_target):
    run_config = RunConfiguration()
    run_config.environment.docker.enabled = True

    raw_data_dir = PipelineData(
        name='raw_data_dir',
        pipeline_output_name='raw_data_dir',
        datastore=datastore_reference.datastore,
        output_mode='mount',
        is_directory=True)

    outputs = [raw_data_dir]
    outputs_map = {'raw_data_dir': raw_data_dir}

    step = PythonScriptStep(
        script_name='data_ingestion.py',
        arguments=['--output_dir', raw_data_dir],
        inputs=[datastore_reference],
        outputs=outputs,
        compute_target=compute_target,
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        runconfig=run_config,
        allow_reuse=True)

    return step, outputs_map

def train_step(train_dir, compute_target):
    max_depth = PipelineParameter(name='max_depth', default_value=5)
    n_estimators = PipelineParameter(name='n_estimators', default_value=500)

    model_dir = PipelineData(
        name='model_dir',
        pipeline_output_name='model_dir',
        datastore=train_dir.datastore,
        output_mode='mount',
        is_directory=True)

    outputs = [model_dir]
    outputs_map = {'model_dir': model_dir}

    estimator = SKLearn(
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        entry_script='train.py',
        compute_target=compute_target)

    step = EstimatorStep(
        estimator=estimator,
        estimator_entry_script_arguments=[
            '--train_dir', train_dir,
            '--output_dir', model_dir,
            '--max_depth', max_depth,
            '--n_estimators', n_estimators
        ],
        inputs=[train_dir],
        compute_target=compute_target,
        outputs=outputs,
        allow_reuse=False)

    return step, outputs_map

def deploy_step(model_dir, accuracy_file, test_dir, compute_target):
    scoring_url = PipelineData(
        name='scoring_url',
        pipeline_output_name='scoring_url',
        datastore=accuracy_file.datastore,
        output_mode='mount',
        is_directory=False)

    outputs = [scoring_url]
    outputs_map = {'scoring_url': scoring_url}

    step = PythonScriptStep(
        script_name='deploy.py',
        arguments=[
            '--model_dir', model_dir,
            '--accuracy_file', accuracy_file,
            '--test_dir', test_dir,
            '--scoring_url', scoring_url
        ],
        inputs=[model_dir, accuracy_file, test_dir],
        outputs=outputs,
        compute_target=compute_target,
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        allow_reuse=False)

    return step, outputs_map

def evaluate_step(model_dir, test_dir, compute_target):
    accuracy_file = PipelineData(
        name='accuracy_file',
        pipeline_output_name='accuracy_file',
        datastore=test_dir.datastore,
        output_mode='mount',
        is_directory=False)

    outputs = [accuracy_file]
    outputs_map = {'accuracy_file': accuracy_file}

    estimator = SKLearn(
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        entry_script='evaluate.py',
        compute_target=compute_target)

    step = EstimatorStep(
        estimator=estimator,
        estimator_entry_script_arguments=[
            '--test_dir', test_dir,
            '--model_dir', model_dir,
            '--accuracy_file', accuracy_file
        ],
        inputs=[model_dir, test_dir],
        outputs=outputs,
        compute_target=compute_target,
        allow_reuse=True)

    return step, outputs_map

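# Hedged sketch chaining the four steps above into one Pipeline. Assumes `workspace`,
# `datastore_reference`, and `compute_target` already exist; feeding the raw
# ingestion output straight into training is an illustrative simplification (a
# preprocessing step could sit in between), and the experiment name is made up.
ingest, ingest_out = data_ingestion_step(datastore_reference, compute_target)
train, train_out = train_step(ingest_out['raw_data_dir'], compute_target)
evaluate, eval_out = evaluate_step(train_out['model_dir'], ingest_out['raw_data_dir'], compute_target)
deploy, _ = deploy_step(train_out['model_dir'], eval_out['accuracy_file'],
                        ingest_out['raw_data_dir'], compute_target)
pipeline = Pipeline(workspace=workspace, steps=[ingest, train, evaluate, deploy])
run = Experiment(workspace, 'sklearn-pipeline-demo').submit(pipeline)
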
def _setup_datastore(self, blob_dataset_name, output_path=None):
    """
    Sets up the datastore in AzureML. Either retrieves a pre-existing
    datastore or registers a new one in the workspace.

    :param str blob_dataset_name: [required] name of the datastore registered
        with the workspace. If the datastore does not yet exist, the name it
        will be registered under.
    :param str output_path: [optional] if registering a datastore for
        inferencing, the output path for writing back predictions.
    """
    try:
        self.blob_ds = Datastore.get(self.ws, blob_dataset_name)
        print("Found Blob Datastore with name: %s" % blob_dataset_name)
    except HttpOperationError:
        self.blob_ds = Datastore.register_azure_blob_container(
            workspace=self.ws,
            datastore_name=blob_dataset_name,
            account_name=self.account_name,
            container_name=self.container_name,
            account_key=self.account_key,
            subscription_id=self.blob_sub_id,
        )
        print("Registered blob datastore with name: %s" % blob_dataset_name)
    if output_path is not None:
        self.output_dir = PipelineData(
            name="output",
            datastore=self.ws.get_default_datastore(),
            output_path_on_compute=output_path)

def process_step(datastore: Datastore,
                 compute: ComputeTarget,
                 path_on_datastore: str) -> Tuple[PipelineData, EstimatorStep]:
    datapath = DataPath(datastore=datastore, path_on_datastore=path_on_datastore)
    data_path_pipeline_param = (PipelineParameter(name="data", default_value=datapath),
                                DataPathComputeBinding(mode='mount'))

    seer_tfrecords = PipelineData("tfrecords_set",
                                  datastore=datastore,
                                  is_directory=True)

    prep = Estimator(source_directory='.',
                     compute_target=compute,
                     entry_script='prep.py',
                     pip_requirements_file='requirements.txt')

    prepStep = EstimatorStep(name='Data Preparation',
                             estimator=prep,
                             estimator_entry_script_arguments=[
                                 "--source_path", data_path_pipeline_param,
                                 "--target_path", seer_tfrecords
                             ],
                             inputs=[data_path_pipeline_param],
                             outputs=[seer_tfrecords],
                             compute_target=compute)

    return seer_tfrecords, prepStep

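# The DataPath PipelineParameter above can be overridden per submission, e.g. to
# rerun the same pipeline over a different folder. A hedged sketch (`ws`,
# `pipeline`, the experiment name, and the new path are assumptions; "data"
# matches the PipelineParameter name used above):
new_path = DataPath(datastore=datastore, path_on_datastore='newdata/')
run = Experiment(ws, 'seer-prep').submit(
    pipeline,
    pipeline_parameters={"data": new_path})
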
def create_databricks_step(
        input_dataset: Dataset,
        compute: ComputeTarget,
        debug_run: bool) -> Tuple[DatabricksStep, PipelineData]:
    output_data = PipelineData(name="ParquetFiles",
                               datastore=WS.get_default_datastore(),
                               is_directory=True)

    node_size = 'Standard_DS4_v2'
    spark_version = '7.3.x-cpu-ml-scala2.12'

    db_step = DatabricksStep(
        name='Convert to Parquet',
        inputs=[input_dataset.as_named_input("CSVFiles")],
        outputs=[output_data],
        source_directory="./safe-driver/prep_data",
        python_script_name='prep_data.py',
        python_script_params=["--number-of-files", "1"],  # Set the number of output files to 1
        num_workers=1,
        compute_target=compute,
        pypi_libraries=[],
        allow_reuse=debug_run,
        node_type=node_size,
        spark_version=spark_version,
    )
    return db_step, output_data

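# Hedged end-to-end wiring of the four safe-driver step factories above. Assumes
# the module-level WS workspace plus pre-existing `csv_dataset`, `validation_ds`,
# `databricks_compute`, and `aml_compute` objects (none of which appear in the source).
prep, parquet_files = create_databricks_step(csv_dataset, databricks_compute, debug_run=True)
train, model_metadata = create_train_model_step(parquet_files, aml_compute, debug_run=True)
evaluate, register_folder = create_evaluate_model_step(model_metadata, aml_compute,
                                                       validation_ds, debug_run=True)
register = create_register_model_step(model_metadata, register_folder,
                                      aml_compute, debug_run=True)
pipeline = Pipeline(workspace=WS, steps=[prep, train, evaluate, register])
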
def _get_data_references(self, request_id, internal_datastore):
    print('AMLCompute, _get_data_references() called. Request ID: {}'.format(request_id))

    # A datastore name may only contain alphanumeric characters and _.
    request_id_to_use_for_datastore = request_id.replace('-', '_')

    try:
        # Setting the overwrite flag to True overwrites any datastore previously created with that name.
        # internal_datastore stores all user-facing files: the list of images, detection results and the
        # list of failed images - and it so happens that each job also needs the list of images as an input.
        internal_datastore_name = 'internal_datastore_{}'.format(request_id_to_use_for_datastore)
        internal_account_name = internal_datastore['account_name']
        internal_account_key = internal_datastore['account_key']
        internal_container_name = internal_datastore['container_name']
        internal_datastore = Datastore.register_azure_blob_container(
            self.ws,
            internal_datastore_name,
            internal_container_name,
            internal_account_name,
            account_key=internal_account_key)
        print('internal_datastore done')

        # output_datastore stores the output of score.py in each job, which is another container
        # in the same storage account as internal_datastore.
        output_datastore_name = 'output_datastore_{}'.format(request_id_to_use_for_datastore)
        output_container_name = api_config.AML_CONTAINER
        output_datastore = Datastore.register_azure_blob_container(
            self.ws,
            output_datastore_name,
            output_container_name,
            internal_account_name,
            account_key=internal_account_key)
        print('output_datastore done')
    except Exception as e:
        raise RuntimeError('Error in connecting to the datastores for AML Compute: {}'.format(str(e)))

    try:
        internal_dir = DataReference(datastore=internal_datastore,
                                     data_reference_name='internal_dir',
                                     mode='mount')
        output_dir = PipelineData('output_{}'.format(request_id_to_use_for_datastore),
                                  datastore=output_datastore,
                                  output_mode='mount')
        print('Finished setting up the Data References.')
    except Exception as e:
        raise RuntimeError('Error in creating data references for AML Compute: {}.'.format(str(e)))

    return internal_dir, output_dir

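# Hedged example of consuming the returned references in a scoring step. The
# `aml_compute_wrapper` instance, `request_id`, `internal_datastore` dict,
# `compute_target`, and the score.py argument names are assumptions for illustration.
internal_dir, output_dir = aml_compute_wrapper._get_data_references(request_id, internal_datastore)
score_step = PythonScriptStep(
    name='score',
    script_name='score.py',
    arguments=['--input_dir', internal_dir, '--output_dir', output_dir],
    inputs=[internal_dir],
    outputs=[output_dir],
    compute_target=compute_target,
    source_directory='.')
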
def train_step(train_dir, valid_dir, compute_target):
    '''
    This step will fine-tune a RESNET-18 model on our dataset using PyTorch.
    It will use the corresponding input image directories as training and
    validation data.

    :param train_dir: The reference to the directory containing the training data
    :type train_dir: DataReference
    :param valid_dir: The reference to the directory containing the validation data
    :type valid_dir: DataReference
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget
    :return: The training step, step outputs dictionary (keys: model_dir)
    :rtype: EstimatorStep, dict
    '''
    num_epochs = PipelineParameter(name='num_epochs', default_value=25)
    batch_size = PipelineParameter(name='batch_size', default_value=16)
    learning_rate = PipelineParameter(name='learning_rate', default_value=0.001)
    momentum = PipelineParameter(name='momentum', default_value=0.9)

    model_dir = PipelineData(
        name='model_dir',
        pipeline_output_name='model_dir',
        datastore=train_dir.datastore,
        output_mode='mount',
        is_directory=True)

    outputs = [model_dir]
    outputs_map = {'model_dir': model_dir}

    estimator = PyTorch(
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        entry_script='train.py',
        framework_version='1.3',
        compute_target=compute_target,
        use_gpu=True)

    step = EstimatorStep(
        name="Train Model",
        estimator=estimator,
        estimator_entry_script_arguments=[
            '--train_dir', train_dir,
            '--valid_dir', valid_dir,
            '--output_dir', model_dir,
            '--num_epochs', num_epochs,
            '--batch_size', batch_size,
            '--learning_rate', learning_rate,
            '--momentum', momentum
        ],
        inputs=[train_dir, valid_dir],
        compute_target=compute_target,
        outputs=outputs,
        allow_reuse=False)

    return step, outputs_map

def data_preprocess_step(raw_data_dir, compute_target):
    run_config = RunConfiguration()
    run_config.environment.python.conda_dependencies = CondaDependencies.create(
        pip_packages=['pandas'])
    run_config.environment.docker.enabled = True

    train_dir = PipelineData(
        name='train_dir',
        pipeline_output_name='train_dir',
        datastore=raw_data_dir.datastore,
        output_mode='mount',
        is_directory=True)
    test_dir = PipelineData(
        name='test_dir',
        pipeline_output_name='test_dir',
        datastore=raw_data_dir.datastore,
        output_mode='mount',
        is_directory=True)

    outputs = [train_dir, test_dir]
    outputs_map = {
        'train_dir': train_dir,
        'test_dir': test_dir,
    }

    step = PythonScriptStep(
        script_name='data_preprocess.py',
        arguments=[
            '--raw_data_dir', raw_data_dir,
            '--train_dir', train_dir,
            '--test_dir', test_dir,
        ],
        inputs=[raw_data_dir],
        outputs=outputs,
        compute_target=compute_target,
        runconfig=run_config,
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        allow_reuse=True)

    return step, outputs_map

def get_test_data(aml_interface):
    datastore = aml_interface.workspace.get_default_datastore()
    datastore_paths = [(datastore, PREDICTION_FILE)]
    dataset = Dataset.Tabular.from_delimited_files(path=datastore_paths)
    registered_iris_ds = dataset.register(workspace=aml_interface.workspace,
                                          name=PREDICTION_DATASET_NAME,
                                          create_new_version=True)
    named_iris_ds = registered_iris_ds.as_named_input(PREDICTION_DATASET_NAME)
    output_folder = PipelineData(name=PARALLEL_TASK_NAME, datastore=datastore)
    return named_iris_ds, output_folder

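# Hedged sketch of how the returned dataset and output folder could feed a
# ParallelRunStep for batch inference. `aml_env`, `compute_target`, and
# batch_predict.py are assumptions; check your azureml-pipeline-steps version
# for the exact ParallelRunConfig signature.
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep

named_iris_ds, output_folder = get_test_data(aml_interface)
parallel_run_config = ParallelRunConfig(
    source_directory='.',
    entry_script='batch_predict.py',
    mini_batch_size='1MB',          # size-based batching for tabular data
    error_threshold=10,
    output_action='append_row',
    environment=aml_env,
    compute_target=compute_target,
    node_count=2)
parallel_step = ParallelRunStep(
    name=PARALLEL_TASK_NAME,
    parallel_run_config=parallel_run_config,
    inputs=[named_iris_ds],
    output=output_folder,
    allow_reuse=False)
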
def prepare(self):
    """
    Prepare a Port instance to be connected by assigning a PipelineData
    instance to its value.
    """
    def _regular_name(port_name):
        # AML Service does not allow names with spaces. Replace them with underscores.
        return '_'.join(port_name.split())

    if not self.prepared:
        conn = PipelineData(_regular_name(self.name))
        self.value = conn

def setup_training_step(self):
    prepped_data = self.prepped_data_path.parse_parquet_files(file_extension=None)

    project_folder = './automl'
    automl_config = AutoMLConfig(compute_target=self.aml_compute,
                                 task="classification",
                                 training_data=prepped_data,
                                 label_column_name="test_result",
                                 path=project_folder,
                                 enable_early_stopping=True,
                                 featurization='auto',
                                 debug_log="automl_errors.log",
                                 n_cross_validations=10,
                                 **automl_settings)

    ds = self.ws.get_default_datastore()
    metrics_output_name = 'metrics_output'
    best_model_output_name = 'model_output'

    metrics_data = PipelineData(
        name='metrics_data',
        datastore=ds,
        pipeline_output_name=metrics_output_name,
        training_output=TrainingOutput(type='Metrics'))
    model_data = PipelineData(
        name='best_model_data',
        datastore=ds,
        pipeline_output_name=best_model_output_name,
        training_output=TrainingOutput(type='Model'))
    self.model_data = model_data

    automl_step = AutoMLStep(name='automl_module',
                             automl_config=automl_config,
                             passthru_automl_config=False,
                             outputs=[metrics_data, model_data],
                             allow_reuse=True)
    return automl_step

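# Once a pipeline containing this AutoMLStep has completed, the
# pipeline_output_name values above let you pull down the metrics and the best
# model. A hedged sketch, assuming `pipeline_run` is a completed PipelineRun:
metrics_port = pipeline_run.get_pipeline_output('metrics_output')
metrics_port.download('.')
model_port = pipeline_run.get_pipeline_output('model_output')
model_port.download('.')
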
def run_script(ws, datastore, pipeline_name, instructions):
    pipeline_steps = []
    last_output = []
    for i in range(len(instructions)):
        compute_target = get_or_create_compute_target(
            ws,
            compute_name=instructions[i]['name'],
            vm_size=instructions[i].get('vm_size', 'STANDARD_D2_V2'),
            min_nodes=instructions[i].get('min_nodes', 0),
            max_nodes=instructions[i].get('max_nodes', 4),
            idle_sec=instructions[i].get('idle_seconds_before_scale_down', 120))
        run_config = create_runconfig(compute_target)

        # input directory in datastore: the first step has no input, each
        # subsequent step consumes the previous step's output
        if len(last_output) == 0:
            input_dir = None
            # input_dir = DataReference(
            #     datastore=datastore,
            #     data_reference_name=DATA_REFERENCE_NAME + str(i),
            #     path_on_datastore="flows/",
            #     mode='download'
            # )
        else:
            input_dir = last_output[-1]

        # output directory in datastore
        output_dir = PipelineData(
            name=DATA_REFERENCE_NAME + str(i),
            datastore=datastore,
            output_path_on_compute=OUTPUT_PATH_ON_COMPUTE)

        # create pipeline step
        pipeline_step = create_mlapp_pipeline_step(
            compute_target,
            run_config,
            source_directory=os.getcwd(),
            entry_script=os.path.join("deployment", "aml_flow.py"),
            input_dir=input_dir,
            output_dir=output_dir,
            param_name='config' + str(i))

        # add to pipeline
        pipeline_steps += pipeline_step

        # remember this output so the next step can consume it
        last_output.append(output_dir)

    publish_pipeline_endpoint(ws, pipeline_steps, pipeline_name)

def _setup_pipelinedata(self, name, output_path=None):
    """
    Helper function to set up a PipelineData object in AzureML.

    :param str name: [required] name of the data object in AzureML
    :param str output_path: path on output datastore to write data to
    :returns: output_data
    :rtype: PipelineData
    """
    if output_path is not None:
        output_data = PipelineData(
            name=name,
            datastore=self.blob_ds,
            output_name=name,
            output_mode="mount",
            output_path_on_compute=output_path,
            is_directory=True,
        )
    else:
        output_data = PipelineData(
            name=name,
            datastore=self.ws.get_default_datastore(),
            output_name=name)
    return output_data

def data_ingestion_step(datastore, compute_target):
    '''
    This step will leverage Azure Cognitive Services to search the web for
    images to create a dataset. This replicates the real-world scenario of
    data being ingested from a constantly changing source. The same 10 classes
    in the CIFAR-10 dataset will be used (airplane, automobile, bird, cat,
    deer, dog, frog, horse, ship, truck).

    :param datastore: The datastore that will be used
    :type datastore: Datastore
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget
    :return: The ingestion step, step outputs dictionary (keys: raw_data_dir)
    :rtype: PythonScriptStep, dict
    '''
    run_config = RunConfiguration()
    run_config.environment.environment_variables = {
        'BING_SEARCH_V7_SUBSCRIPTION_KEY': os.environ['BING_SEARCH_V7_SUBSCRIPTION_KEY'],
        'BING_SEARCH_V7_ENDPOINT': os.environ['BING_SEARCH_V7_ENDPOINT'],
        'AZURE_REGION': datastore._workspace.location
    }
    run_config.environment.docker.enabled = True

    num_images = PipelineParameter(name='num_images', default_value=25)

    raw_data_dir = PipelineData(
        name='raw_data_dir',
        pipeline_output_name='raw_data_dir',
        datastore=datastore,
        output_mode='mount',
        is_directory=True)

    outputs = [raw_data_dir]
    outputs_map = {'raw_data_dir': raw_data_dir}

    step = PythonScriptStep(
        name="Data Ingestion",
        script_name='data_ingestion.py',
        arguments=['--output_dir', raw_data_dir, '--num_images', num_images],
        outputs=outputs,
        compute_target=compute_target,
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        runconfig=run_config,
        allow_reuse=True)

    return step, outputs_map

def evaluate_step(model_dir, test_dir, compute_target):
    '''
    This step evaluates the trained model on the testing data and outputs the accuracy.

    :param model_dir: The reference to the directory containing the trained model
    :type model_dir: DataReference
    :param test_dir: The reference to the directory containing the testing data
    :type test_dir: DataReference
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget
    :return: The evaluate step, step outputs dictionary (keys: accuracy_file)
    :rtype: EstimatorStep, dict
    '''
    accuracy_file = PipelineData(
        name='accuracy_file',
        pipeline_output_name='accuracy_file',
        datastore=test_dir.datastore,
        output_mode='mount',
        is_directory=False)

    outputs = [accuracy_file]
    outputs_map = {'accuracy_file': accuracy_file}

    estimator = PyTorch(
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        entry_script='evaluate.py',
        framework_version='1.3',
        compute_target=compute_target,
        use_gpu=True)

    step = EstimatorStep(
        name="Evaluate Model",
        estimator=estimator,
        estimator_entry_script_arguments=[
            '--test_dir', test_dir,
            '--model_dir', model_dir,
            '--accuracy_file', accuracy_file
        ],
        inputs=[model_dir, test_dir],
        outputs=outputs,
        compute_target=compute_target,
        allow_reuse=True)

    return step, outputs_map

def get_pipeline_data(self, config):
    pipeline_data = []
    for c in config:
        if c["type"] == StepArgParser.ARG_TYPE_PIPELINE_DATA:
            pconfig = c["config"]
            pname = pconfig["name"]
            pds = pconfig.get("datastore") or "default"
            if pds == "default":
                use_ds = self.workspace.get_default_datastore()
            else:
                use_ds = Datastore.get(workspace=self.workspace,
                                       datastore_name=pds)
            pd = PipelineData(pname, datastore=use_ds)
            pipeline_data.append(pd)
    return pipeline_data

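# Illustrative config shape that get_pipeline_data() expects; the entry values
# are made up, and `parser` stands in for an instance of the owning class.
config = [{
    "type": StepArgParser.ARG_TYPE_PIPELINE_DATA,
    "config": {"name": "processed_data", "datastore": "default"},
}]
pipeline_data = parser.get_pipeline_data(config)
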
def estimator(data, store, compute):
    estimator = Estimator(
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        compute_target=compute,
        entry_script='train.py',
        pip_packages=['azureml-dataprep', 'lightgbm'])

    output = PipelineData("output", datastore=store)

    step = EstimatorStep(name=os.path.basename(__file__),
                         estimator=estimator,
                         estimator_entry_script_arguments=[
                             '--input_dir', data,
                             '--output_dir', output
                         ],
                         inputs=[data],
                         outputs=[output],
                         compute_target=compute,  # use the passed-in target rather than the private _compute_target
                         allow_reuse=True)
    return step, output

def deploy_step(model_dir, accuracy_file, test_dir, compute_target):
    '''
    This step registers and deploys a new model on its first run. In
    subsequent runs, it will only register and deploy a new model if the
    training dataset has changed, or if the dataset is unchanged but the
    accuracy improved.

    :param model_dir: The reference to the directory containing the trained model
    :type model_dir: DataReference
    :param accuracy_file: The reference to the file containing the evaluation accuracy
    :type accuracy_file: DataReference
    :param test_dir: The reference to the directory containing the testing data
    :type test_dir: DataReference
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget
    :return: The deploy step, step outputs dictionary (keys: scoring_url)
    :rtype: PythonScriptStep, dict
    '''
    scoring_url = PipelineData(
        name='scoring_url',
        pipeline_output_name='scoring_url',
        datastore=accuracy_file.datastore,
        output_mode='mount',
        is_directory=False)

    outputs = [scoring_url]
    outputs_map = {'scoring_url': scoring_url}

    step = PythonScriptStep(
        name="Deploy Model",
        script_name='deploy.py',
        arguments=[
            '--model_dir', model_dir,
            '--accuracy_file', accuracy_file,
            '--test_dir', test_dir,
            '--scoring_url', scoring_url
        ],
        inputs=[model_dir, accuracy_file, test_dir],
        outputs=outputs,
        compute_target=compute_target,
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        allow_reuse=False)

    return step, outputs_map

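# Hedged wiring of the CIFAR-style steps above (data_ingestion_step,
# data_preprocess_step, train_step, evaluate_step, deploy_step) into one pipeline.
# Assumes `ws`, `datastore`, and `compute_target` exist; because the preprocessing
# variant above emits train/test splits while the PyTorch train step expects a
# validation directory, test_dir doubles as valid_dir here purely for illustration.
ingest, ingest_out = data_ingestion_step(datastore, compute_target)
prep, prep_out = data_preprocess_step(ingest_out['raw_data_dir'], compute_target)
train, train_out = train_step(prep_out['train_dir'], prep_out['test_dir'], compute_target)
evaluate, eval_out = evaluate_step(train_out['model_dir'], prep_out['test_dir'], compute_target)
deploy, deploy_out = deploy_step(train_out['model_dir'], eval_out['accuracy_file'],
                                 prep_out['test_dir'], compute_target)
pipeline = Pipeline(workspace=ws, steps=[ingest, prep, train, evaluate, deploy])
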
def register_step(datastore: Datastore,
                  input_data: PipelineData,
                  compute: ComputeTarget,
                  build: str) -> Tuple[PipelineData, EstimatorStep]:
    seer_model = PipelineData("model",
                              datastore=datastore,
                              is_directory=True)

    register = Estimator(source_directory='.',
                         compute_target=compute,
                         entry_script='register.py')

    registerStep = EstimatorStep(name='Model Registration',
                                 estimator=register,
                                 estimator_entry_script_arguments=[
                                     "--source_path", input_data,
                                     "--target_path", seer_model,
                                     "--build", build
                                 ],
                                 inputs=[input_data],
                                 outputs=[seer_model],
                                 compute_target=compute)

    return seer_model, registerStep

def train_step(datastore: Datastore,
               input_data: PipelineData,
               compute: ComputeTarget) -> Tuple[PipelineData, EstimatorStep]:
    seer_training = PipelineData("train",
                                 datastore=datastore,
                                 is_directory=True)

    train = Estimator(source_directory='.',
                      compute_target=compute,
                      entry_script='train.py',
                      use_gpu=True,
                      pip_requirements_file='requirements.txt')

    trainStep = EstimatorStep(name='Model Training',
                              estimator=train,
                              estimator_entry_script_arguments=[
                                  "--source_path", input_data,
                                  "--target_path", seer_training,
                                  "--epochs", 15,
                                  "--batch", 10,
                                  "--lr", 0.001
                              ],
                              inputs=[input_data],
                              outputs=[seer_training],
                              compute_target=compute)

    return seer_training, trainStep

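# Hedged sketch chaining the three seer steps above; each factory returns
# (output, step) in that order. `ws`, `datastore`, `compute`, the datastore
# path, and the build id are assumptions.
tfrecords, prep_step = process_step(datastore, compute, 'seer/images')
training_out, training_step = train_step(datastore, tfrecords, compute)
model, registration_step = register_step(datastore, training_out, compute, build='local-001')
pipeline = Pipeline(workspace=ws, steps=[prep_step, training_step, registration_step])
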
def build_vocab_step(train_dir, compute_target):
    '''
    This step builds a vocabulary from the training data, using the configured
    input/output text columns, vocabulary size, and frequency cutoff.

    :param train_dir: The reference to the directory containing the training data
    :type train_dir: DataReference
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget
    :return: The vocab step, step outputs dictionary (keys: vocab_dir)
    :rtype: PythonScriptStep, dict
    '''
    run_config = RunConfiguration()
    run_config.environment.docker.enabled = True
    run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    run_config.environment.python.user_managed_dependencies = False
    conda_packages = ['pytorch']
    run_config.environment.python.conda_dependencies = CondaDependencies.create(
        conda_packages=conda_packages)

    input_col = PipelineParameter(name='input_col', default_value='Title')
    output_col = PipelineParameter(name='output_col', default_value='Abstract')
    size = PipelineParameter(name='size', default_value=50000)
    freq_cutoff = PipelineParameter(name='freq_cutoff', default_value=2)

    vocab_dir = PipelineData(
        name='vocab_dir',
        pipeline_output_name='vocab_dir',
        datastore=train_dir.datastore,
        output_mode='mount',
        is_directory=True)

    outputs = [vocab_dir]
    outputs_map = {'vocab_dir': vocab_dir}

    step = PythonScriptStep(
        name="Build Vocab",
        script_name='build_vocab.py',
        arguments=[
            '--train_dir', train_dir,
            '--vocab_dir', vocab_dir,
            '--input_col', input_col,
            '--output_col', output_col,
            '--size', size,
            '--freq_cutoff', freq_cutoff,
        ],
        inputs=[train_dir],
        outputs=outputs,
        compute_target=compute_target,
        runconfig=run_config,
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        allow_reuse=True)

    return step, outputs_map

# The Azure ML pipeline is composed of two steps:
#
# - Data pre-processing, which consists of one-hot encoding categorical features, normalizing the feature set, splitting the dataset into training/testing sets, and finally writing the output to storage.
#
# - A Hyperdrive step that tunes and trains the deep kernel learning model using GPyTorch and the PyTorch estimator.

#%% [markdown]
# ## Pipeline data input/output
#
# Here, we define the input and intermediary datasets that will be used by the pipeline steps.

#%%
input_dir = DataReference(datastore=default_store,
                          data_reference_name="input_data",
                          path_on_datastore="churn")
processed_dir = PipelineData(name='processed_data', datastore=default_store)

#%% [markdown]
# ## Pipeline 1st step: Data Preprocessing
#
# We start by defining the run configuration with the dependencies needed by the preprocessing step.
#
# In the cell that follows, we compose the first step of the pipeline.

#%%
cd = CondaDependencies()
cd.add_conda_package('pandas')
cd.add_conda_package('matplotlib')
cd.add_conda_package('numpy')
cd.add_conda_package('scikit-learn')
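
#%% [markdown]
# Below is a hedged sketch of how the preprocessing step could be composed with the
# dependencies defined above. The script name, source directory, and `aml_compute`
# target are assumptions, not taken from the original notebook.

#%%
run_config = RunConfiguration(conda_dependencies=cd)
run_config.environment.docker.enabled = True

preprocess_step = PythonScriptStep(
    name="Preprocess Data",
    script_name='preprocess.py',
    arguments=['--input_dir', input_dir, '--output_dir', processed_dir],
    inputs=[input_dir],
    outputs=[processed_dir],
    compute_target=aml_compute,
    runconfig=run_config,
    source_directory='./scripts')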