class AzureMLEnvironment(env.WorkEnvironment):
    is_connected: bool = False
    __config_file: str = '.azureml/config.json'
    __workspace: Workspace = None
    __datastore_path: str = 'data'

    def __init__(self, config_file: str = None, datastore_path: str = None, subscription_id: str = None,
                 connect_workspace: bool = True, resource_group: str = None, workspace_name: str = None,
                 write_config: bool = False, from_context: bool = False):
        '''
        Allows a user to work connected or disconnected.
        When connecting, the implementation checks for a local AzureML config file and connects to the
        Azure ML workspace in that case, or builds the Workspace from the given parameters.
        Args:
            config_file (str): The name of the config file (defaulting to .azureml/config.json) that contains the Workspace parameters
            datastore_path (str): The name of a DataStore in AzureML that contains Datasets
            subscription_id (str): The subscription id where the AzureML service resides
            connect_workspace (bool): If True, a connection to the AzureML workspace will be established
            resource_group (str): The resource group that contains the AzureML workspace
            workspace_name (str): Name of the AzureML workspace
            write_config (bool): If True, the Workspace configuration will be persisted in the given (or default) config file
            from_context (bool): If True, the Workspace will be taken from the current Run context
        '''
        self.__datastore_path = datastore_path

        if from_context:
            run = Run.get_context()
            self.__workspace = run.experiment.workspace
            self.__print_connection_info()
        else:
            # User wants to connect
            # Setting the config file to the passed argument
            if config_file:
                # Since a file name is passed, we should check if it exists
                self.__config_file = config_file

            # If workspace parameters are passed, the workspace will be taken and the config file will be ignored
            if subscription_id and resource_group and workspace_name:
                self.__workspace = Workspace(subscription_id, resource_group, workspace_name)
                if write_config:
                    self.__workspace.write_config(self.__config_file)
            elif connect_workspace:
                # A config file is passed, so we'll validate its existence and connect
                if not os.path.exists(self.__config_file):
                    raise FileNotFoundError('The config file ' + self.__config_file + ' does not exist. Please verify and try again')
                # There is a config file, so we'll connect
                self.__connect_from_config_file(self.__config_file)

        self.is_connected = connect_workspace

    @classmethod
    def CreateFromContext(cls, datastore_path: str = None):
        '''
        Creates a WorkEnvironment and returns the correct implementation, based on the configuration
        Args:
            datastore_path (str): The name of a DataStore in AzureML that contains Datasets
        Returns:
            AzureMLEnvironment: an instance of WorkEnvironment allowing the user to work connected.
        '''
        return cls(datastore_path=datastore_path, from_context=True)

    @classmethod
    def Create(cls, subscription_id: str = None, resource_group: str = None, workspace_name: str = None,
               write_config: bool = False, config_file: str = None, datastore_path: str = None):
        '''
        Creates a WorkEnvironment and returns the correct implementation, based on the configuration
        Args:
            subscription_id (str): The subscription id where the AzureML service resides
            resource_group (str): The resource group that contains the AzureML workspace
            workspace_name (str): Name of the AzureML workspace
            write_config (bool): If True, the Workspace configuration will be persisted in the given (or default) config file
            config_file (str): The name of the config file (defaulting to .azureml/config.json) that contains the Workspace parameters
            datastore_path (str): The name of a DataStore in AzureML that contains Datasets
        Returns:
            AzureMLEnvironment: an instance of WorkEnvironment allowing the user to work connected.
        '''
        return cls(config_file=config_file, datastore_path=datastore_path, subscription_id=subscription_id,
                   resource_group=resource_group, workspace_name=workspace_name, write_config=write_config)

    def load_tabular_dataset(self, dataset_name: str, cloud_storage: bool = True) -> pd.DataFrame:
        '''
        Loads a tabular dataset by a given name.
        The implementation will load the Dataset by name from the AzureML Workspace.
        When configured locally, the data frame will be loaded from a file in the datastore_path with name {dataset_name}.csv
        Args:
            dataset_name (str): The name of the dataset to load
            cloud_storage (bool): When changed to False, the dataset will be loaded from the local folder
        Returns:
            pd.DataFrame: The dataset, loaded as a DataFrame
        '''
        if cloud_storage:
            # Connecting to the registered dataset in the workspace
            _dataset = Dataset.get_by_name(self.__workspace, name=dataset_name)
            return _dataset.to_pandas_dataframe()
        else:
            # Local mode: read the csv file from the datastore_path folder
            _file_name = os.path.join(self.__datastore_path, dataset_name + '.csv')
            return pd.read_csv(_file_name)

    def load_tabular_partition(self, partition_name: str, datastore_name: str = None, columns: np.array = None,
                               first_row_header: bool = False, cloud_storage: bool = True) -> pd.DataFrame:
        '''
        Loads a partition from a tabular dataset.
        The implementation will connect to the DataStore and get all delimited files matching the partition_name.
        When configured locally, the implementation will append all files in the datastore_path with name {partition_name}.csv
        Args:
            partition_name (str): The name of the partition as a wildcard filter. Example: B* will take all files starting with B, ending with csv
            datastore_name (str): The name of a DataStore that contains Datasets
            columns (np.array): The column names to assign to the dataframe
            first_row_header (bool): When True, the first row of every file is treated as the header
            cloud_storage (bool): When changed to False, the dataset will be loaded from the local folder
        Returns:
            pd.DataFrame: The dataset, loaded as a DataFrame
        '''
        if not datastore_name:
            # No datastore name is given, so we'll take the default one
            datastore_name = self.__datastore_path

        if cloud_storage:
            # Connecting to the data store
            datastore = Datastore(self.__workspace, name=datastore_name)
            try:
                _header = PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS if first_row_header else False
                _aml_dataset = Dataset.Tabular.from_delimited_files(
                    header=_header,
                    path=DataPath(datastore, '/' + partition_name + '.csv'))  # , set_column_types=columns
                _df = _aml_dataset.to_pandas_dataframe()
            except DatasetValidationError as dsvalex:
                if 'provided path is not valid' in str(dsvalex):
                    return None
                else:
                    raise
        else:
            # Reading data from matching files in a local folder
            _folder_path = datastore_name
            _partition_files = glob.glob(_folder_path + '/' + partition_name + '.csv')
            _record_found = False
            _df = None
            for filename in _partition_files:
                _header = 0 if first_row_header else None
                df = pd.read_csv(filename, index_col=None, header=_header)
                if not _record_found:
                    _df = df
                    _record_found = True
                else:
                    _df = pd.concat([_df, df])

            if not _record_found:
                return None

        if columns is not None:
            _df.columns = columns
        return _df

    def upload_dataset(self, dataset_name: str, local_folder: str, datastore_name: str = None,
                       overwrite: bool = False, tags: dict = None):
        '''
        Uploads data from a local directory into an AzureML Datastore that points to Azure Data Lake
        Args:
            dataset_name (str): The name of the dataset to register
            local_folder (str): The location of the local directory to take files from
            datastore_name (str): The name of a DataStore that will contain the dataset
            overwrite (bool): If True, existing files in the Datastore will be overwritten
            tags (dict): Tags to register with the dataset
        Returns:
            FileDataset: The registered dataset, containing the files
        '''
        if not datastore_name:
            # No datastore name is given, so we'll take the default one
            datastore_name = self.__datastore_path

        # Connecting to the data store
        datastore = Datastore(self.__workspace, name=datastore_name)
        # TODO : check type of datastore
        datastore.upload(local_folder, dataset_name, overwrite, True)

        datastore_paths = [(datastore, dataset_name)]
        file_ds = Dataset.File.from_files(path=datastore_paths)
        file_ds = file_ds.register(workspace=self.__workspace,
                                   name=dataset_name,
                                   description=dataset_name,
                                   tags=tags,
                                   create_new_version=True)
        return file_ds

    def start_experiment(self, name: str) -> aml_trainer.AzureMLTrainer:
        '''
        Creates a new experiment (or connects to an existing one), using the given name
        Args:
            name (str): The name of the experiment which will be used in AzureML
        Returns:
            AzureMLTrainer: a Trainer object that can be used to perform trainings and add logging in AzureML
        '''
        return aml_trainer.AzureMLTrainer(name, self.__workspace)

    def get_secret(self, secret_name: str) -> str:
        '''
        Reads a secret string from the registered Azure KeyVault
        Args:
            secret_name (str): The name of the secret key registered in Azure KeyVault
        Returns:
            str: The secret value in the KeyVault, or None when it cannot be retrieved
        '''
        keyvault = self.__workspace.get_default_keyvault()
        try:
            return keyvault.get_secret(name=secret_name)
        except Exception:
            return None

    def isvalid(self) -> bool:
        return True

    def get_azureml_workspace(self):
        return self.__workspace

    def __connect_from_config_file(self, file_name: str):
        self.__workspace = Workspace.from_config(_file_name=file_name)
        self.__print_connection_info()

    def __print_connection_info(self):
        print('Connected to AzureML workspace')
        print('>> Name:', self.__workspace._workspace_name)
        print('>> Subscription:', self.__workspace.subscription_id)
        print('>> Resource group:', self.__workspace.resource_group)

    def capture_filedataset_layout(self, dataset_name: str, output_path: str):
        '''
        Captures the file layout (portable paths and stream properties such as size) of a registered
        FileDataset, writes the profile to output_path and returns the overview as a DataFrame
        '''
        from azureml.dataprep.api.functions import get_portable_path
        from azureml.dataprep import col, get_stream_properties, SummaryColumnsValue, SummaryFunction

        dataset = self.__workspace.datasets[dataset_name]
        files_column = 'Path'
        PORTABLE_PATH = 'PortablePath'
        STREAM_PROPERTIES = 'StreamProperties'
        dataflow = dataset._dataflow \
            .add_column(get_portable_path(col(files_column), None), PORTABLE_PATH, files_column) \
            .add_column(get_stream_properties(col(files_column)), STREAM_PROPERTIES, PORTABLE_PATH) \
            .keep_columns([files_column, PORTABLE_PATH, STREAM_PROPERTIES])
        dataflow_to_execute = dataflow.add_step(
            'Microsoft.DPrep.WritePreppyBlock', {
                'outputPath': {
                    'target': 0,
                    'resourceDetails': [{'path': str(output_path)}]
                },
                'profilingFields': ['Kinds', 'MissingAndEmpty']
            })
        dataflow_to_execute.run_local()
        df = dataflow.to_pandas_dataframe(extended_types=True)
        df = df.merge(pd.io.json.json_normalize(df.StreamProperties), left_index=True, right_index=True)
        print(f'{len(df.index)} files found in the dataset, totalling to a size of {(df.Size.sum() / (1024 * 1024)):,.2f} MB')
        return df
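# Example usage (a minimal sketch of the class above; the subscription, resource group,
# workspace, datastore and dataset names are placeholders, not values from this repo):
#
#   env = AzureMLEnvironment.Create(subscription_id='<subscription-id>',
#                                   resource_group='<resource-group>',
#                                   workspace_name='<workspace-name>',
#                                   write_config=True)
#   df = env.load_tabular_dataset('my-dataset')                 # registered Dataset -> DataFrame
#   part = env.load_tabular_partition('B*', datastore_name='my-datastore',
#                                     first_row_header=True)    # all B*.csv files in the datastore
#   trainer = env.start_experiment('my-experiment')             # AzureMLTrainer for training/logging
#
# Inside an AzureML run (e.g. a pipeline step), the workspace can be taken from the run
# context instead of a config file:
#
#   env = AzureMLEnvironment.CreateFromContext()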
config = json.load(f)

auth = ServicePrincipalAuthentication(
    tenant_id=config["tenant_id"],
    service_principal_id=config["service_principal_id"],
    service_principal_password=config["service_principal_password"],
)

ws = Workspace(config["subscription_id"], config["resource_group"], config["workspace_name"], auth=auth)
print(ws.get_details())

# Store the service principal credentials in the workspace's default KeyVault
keyvault = ws.get_default_keyvault()
keyvault.set_secret("tenantID", config["tenant_id"])
keyvault.set_secret("servicePrincipalId", config["service_principal_id"])
keyvault.set_secret("servicePrincipalPassword", config["service_principal_password"])

# Folder for scripts that need to be uploaded to the AML compute target
script_folder = "./scripts/"
if os.path.exists(script_folder):
    print("Deleting:", script_folder)
    shutil.rmtree(script_folder)
os.makedirs(script_folder)
shutil.copy(os.path.join(base_dir, "utils.py"), script_folder)
shutil.copy(os.path.join(base_dir, "pipelines_slave.py"), script_folder)
shutil.copy(os.path.join(base_dir, "train.py"), script_folder)
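# Downstream steps can read these secrets back from the workspace KeyVault instead of
# shipping the JSON config (a sketch; it assumes the secrets were stored by the block above):
#
#   keyvault = ws.get_default_keyvault()
#   auth = ServicePrincipalAuthentication(
#       tenant_id=keyvault.get_secret("tenantID"),
#       service_principal_id=keyvault.get_secret("servicePrincipalId"),
#       service_principal_password=keyvault.get_secret("servicePrincipalPassword"),
#   )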