def _setup_datastore(self, blob_dataset_name, output_path=None):
    """Sets up the datastore in AzureML. Either retrieves a pre-existing datastore
    or registers a new one in the workspace.

    :param str blob_dataset_name: [required] name of the datastore registered with the
        workspace. If the datastore does not yet exist, the name it will be registered under.
    :param str output_path: [optional] if registering a datastore for inferencing,
        the output path for writing back predictions.
    """
    try:
        self.blob_ds = Datastore.get(self.ws, blob_dataset_name)
        print("Found Blob Datastore with name: %s" % blob_dataset_name)
    except HttpOperationError:
        self.blob_ds = Datastore.register_azure_blob_container(
            workspace=self.ws,
            datastore_name=blob_dataset_name,
            account_name=self.account_name,
            container_name=self.container_name,
            account_key=self.account_key,
            subscription_id=self.blob_sub_id,
        )
        print("Registered blob datastore with name: %s" % blob_dataset_name)
    if output_path is not None:
        self.output_dir = PipelineData(
            name="output",
            datastore=self.ws.get_default_datastore(),
            output_path_on_compute=output_path,
        )
def prepare():
    ws = None
    try:
        print("Connecting to workspace '%s'..." % workspace_name)
        ws = Workspace(subscription_id=subscription_id,
                       resource_group=resource_group,
                       workspace_name=workspace_name)
    except Exception:
        print("Workspace not accessible.")
        return
    print(ws.get_details())
    ws.write_config()

    #
    # Register an existing datastore to the workspace.
    #
    if datastore_name not in ws.datastores:
        Datastore.register_azure_blob_container(
            workspace=ws,
            datastore_name=datastore_name,
            container_name=blob_container_name,
            account_name=blob_account_name,
            account_key=blob_account_key
        )
        print("Datastore '%s' registered." % datastore_name)
    else:
        print("Datastore '%s' has already been registered." % datastore_name)
def upload_dataset(self, dataset_name: str, local_folder: str, datastore_name: str = None,
                   overwrite: bool = False, tags: dict = None) -> FileDataset:
    '''
    Uploads data from a local directory into an AzureML Datastore that points to Azure Data Lake

    Args:
        dataset_name (str): The name of the dataset to register
        local_folder (str): The location of the local directory to take files from
        datastore_name (str): The name of a Datastore that will contain the dataset
        overwrite (bool): Whether to overwrite existing files in the datastore
        tags (dict): Tags to attach to the registered dataset
    Returns:
        FileDataset: The registered dataset, containing the files
    '''
    if not datastore_name:
        # No datastore name is given, so we'll take the default one
        datastore_name = self.__datastore_path

    # Connect to the datastore
    datastore = Datastore(self.__workspace, name=datastore_name)
    # TODO : check type of datastore
    datastore.upload(local_folder, dataset_name, overwrite, True)
    datastore_paths = [(datastore, dataset_name)]
    file_ds = Dataset.File.from_files(path=datastore_paths)

    file_ds = file_ds.register(workspace=self.__workspace,
                               name=dataset_name,
                               description=dataset_name,
                               tags=tags,
                               create_new_version=True)
    return file_ds
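# Minimal standalone sketch of the same upload-and-register pattern (not from the original
# source): copy a local folder to a datastore and register it as a FileDataset. The datastore
# name, local path, and dataset name below are hypothetical placeholders.
from azureml.core import Workspace, Datastore, Dataset

if __name__ == "__main__":
    ws = Workspace.from_config()                          # assumes a config.json is available
    datastore = Datastore.get(ws, "workspaceblobstore")   # hypothetical datastore name
    # Copy the local files into the datastore under a target folder
    datastore.upload(src_dir="./data/sample", target_path="sample", overwrite=True)
    # Point a FileDataset at the uploaded folder and register a new version
    file_ds = Dataset.File.from_files(path=[(datastore, "sample")])
    file_ds = file_ds.register(workspace=ws, name="sample-files", create_new_version=True)
    print(file_ds.name, file_ds.version)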
def register_datastore(self, datastore_name, blob_container, storage_acct_name, storage_acct_key):
    Datastore.register_azure_blob_container(workspace=self.workspace,
                                            datastore_name=datastore_name,
                                            container_name=blob_container,
                                            account_name=storage_acct_name,
                                            account_key=storage_acct_key)
def __enter__(self):
    """Download files for datastore.

    :return:
    """
    module_logger.debug("Enter __enter__ function of datastore cmgr")
    from azureml.core import Datastore, Dataset
    for key, value in self._config.items():
        df_config, _ = self._to_data_reference_config(value)
        if self._is_upload(df_config):
            if df_config.path_on_compute:
                dir_to_create = os.path.normpath(os.path.dirname(df_config.path_on_compute))
                if dir_to_create:
                    _safe_mkdirs(dir_to_create)
        else:
            target_path = df_config.data_store_name
            if df_config.path_on_compute:
                target_path = os.path.join(df_config.data_store_name, df_config.path_on_compute)
                # The target_path is always set using the data store name, with no way
                # for the user to overwrite this behavior. The user might attempt to use ../ in
                # the path on compute as a solution, but this throws an exception
                # because the path is not normalized.
                # Normalizing the path allows the user to use up-level references.
                target_path = os.path.normpath(target_path)
            if self._is_download(df_config):
                self._validate_config(df_config, key)
                ds = Datastore(workspace=self._workspace, name=df_config.data_store_name)
                if self._is_datastore_adlsgen1(ds):
                    _log_and_print("AzureDataLake Gen1 used as Datastore for download")
                    if df_config.path_on_data_store is None:
                        df_config.path_on_data_store = ""
                    Dataset.File.from_files((ds, df_config.path_on_data_store)).download(
                        os.path.join(target_path, df_config.path_on_data_store),
                        overwrite=df_config.overwrite)
                else:
                    count = ds.download(target_path=target_path,
                                        prefix=df_config.path_on_data_store,
                                        overwrite=df_config.overwrite)
                    if count == 0:
                        import warnings
                        warnings.warn("Downloaded 0 files from datastore {} with path {}.".format(
                            ds.name, df_config.path_on_data_store))
            else:
                _safe_mkdirs(target_path)
    module_logger.debug("Exit __enter__ function of datastore cmgr")
def get_datastore(ws: Workspace, datastore_name: str, container: str,
                  account_name: str, account_key: str) -> Datastore:
    if datastore_name not in ws.datastores:
        Datastore.register_azure_blob_container(workspace=ws,
                                                datastore_name=datastore_name,
                                                container_name=container,
                                                account_name=account_name,
                                                account_key=account_key,
                                                create_if_not_exists=True)
    return ws.datastores[datastore_name]
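# Hedged usage sketch for get_datastore() above (not from the original source): the storage
# account, container, and datastore names are hypothetical placeholders, and the account key
# is read from an assumed environment variable so it stays out of source control.
import os
from azureml.core import Workspace

if __name__ == "__main__":
    ws = Workspace.from_config()  # assumes a config.json alongside the script
    datastore = get_datastore(ws,
                              datastore_name="training_blob",         # hypothetical
                              container="training-data",              # hypothetical
                              account_name="mystorageaccount",        # hypothetical
                              account_key=os.environ["STORAGE_KEY"])  # assumed env var
    print("Using datastore:", datastore.name)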
def register_datastore(workspace, ds_config):
    ds_name = ds_config.get("name")
    if not is_datastore_exists(workspace, ds_name):
        Datastore.register_azure_blob_container(
            workspace=workspace,
            datastore_name=ds_name,
            account_name=ds_config.get("account_name"),
            container_name=ds_config.get("container_name"),
            account_key=ds_config.get("account_key"),
            create_if_not_exists=ds_config.get("create_if_not_exists")
        )
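# Example configuration for register_datastore() above (illustrative only): the keys mirror
# the ds_config.get(...) calls in the function; all values are hypothetical placeholders and
# the account key comes from an assumed environment variable.
import os
from azureml.core import Workspace

if __name__ == "__main__":
    ws = Workspace.from_config()
    ds_config = {
        "name": "training_blob",                   # hypothetical datastore name
        "account_name": "mystorageaccount",        # hypothetical storage account
        "container_name": "training-data",         # hypothetical container
        "account_key": os.environ["STORAGE_KEY"],  # assumed env var for the secret
        "create_if_not_exists": False,
    }
    register_datastore(ws, ds_config)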
def main():
    # Connect to your AMLS Workspace and set your Datastore
    ws = run.experiment.workspace
    datastoreName = args.datastore_name
    datastore = Datastore.get(ws, datastoreName)
    print('Datastore Set')

    # Set your Time Zone
    timeZone = pytz.timezone(args.pytz_time_zone)
    timeLocal = dt.datetime.now(timeZone).strftime('%Y-%m-%d')
    print('Time Zone Set')

    # Specify your File Names
    trainFile = timeLocal + '/' + args.train_file_name
    valFile = timeLocal + '/' + args.val_file_name
    print('File Names Set for Training and Validation Data.')

    # Set Tags and Description
    description = args.project_description
    trainTags = set_tags(['Project', 'Dataset Type', 'Date Created'],
                         [args.project_name, 'Training', timeLocal])
    valTags = set_tags(['Project', 'Dataset Type', 'Date Created'],
                       [args.project_name, 'Validation', timeLocal])
    print("Dataset Tags and Description Assigned")

    # Register your Training data as an Azure Tabular Dataset
    register_dataset(ws, datastore, args.datastore_path, trainFile,
                     args.train_dataset_name, description, trainTags)
    print('Training Data Registered')

    # Register your Validation data as an Azure Tabular Dataset
    register_dataset(ws, datastore, args.datastore_path, valFile,
                     args.val_dataset_name, description, valTags)
    print('Validation Data Registered')
def update_dataset(ws, datastore_name, dataset, time_stamp):
    datastore = Datastore.get(ws, datastore_name)
    # datastore = adlsgen2_datastore

    if dataset["dataset_name"] in ws.datasets:
        print("Dataset " + dataset["dataset_name"] + " already created in " + ws.name +
              ", will update to new version...")
    else:
        print("Dataset " + dataset["dataset_name"] + " is new and will be created in " +
              ws.name + "...")

    # Create a TabularDataset from the path in the datastore
    datastore_paths = [(datastore, dataset["dataset_path"])]
    retrieved_dataset = Dataset.Tabular.from_delimited_files(path=datastore_paths)

    # Register the dataset (and make a new version if needed).
    # The timestamp in the description makes it easier to see that the same dataset was
    # registered at the same time in different workspaces, if you want to filter on it.
    retrieved_dataset = retrieved_dataset.register(
        workspace=ws,
        name=dataset["dataset_name"],
        description='versioned data, timestamp: ' + time_stamp,
        create_new_version=True)
    print("Updated dataset " + dataset["dataset_name"] + " in workspace " + ws.name +
          " at timestamp " + time_stamp)
    return retrieved_dataset
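# Illustrative call to update_dataset() above (not from the original source): the dataset
# dict only needs the two keys the function reads; the names and path are hypothetical.
from datetime import datetime, timezone
from azureml.core import Workspace

if __name__ == "__main__":
    ws = Workspace.from_config()
    dataset = {
        "dataset_name": "sales-daily",              # hypothetical dataset name
        "dataset_path": "curated/sales/daily.csv",  # hypothetical path on the datastore
    }
    time_stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M")
    update_dataset(ws, "workspaceblobstore", dataset, time_stamp)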
def write_results(df, cols, output_datastore, output_path, model, run):
    ws = run.experiment.workspace
    datastore = Datastore.get(ws, output_datastore)
    output_folder = tempfile.TemporaryDirectory(dir="/tmp")
    filename = os.path.join(output_folder.name, os.path.basename(output_path))
    print("Output filename: {}".format(filename))
    try:
        os.remove(filename)
    except OSError:
        pass

    df["ScoredLabels"] = model.predict(df[cols].astype(int).values)
    print("resultLabels", df["ScoredLabels"].iloc[:10])
    df["ScoredProbabilities"] = model.predict_proba(df[cols].astype(int).values)[:, 1]
    print("resultProbabilities", df["ScoredProbabilities"].iloc[:10])

    # Set CustomerId as the index so the auto-generated index column is not written out
    df = df.set_index("CustomerId")

    directory_name = os.path.dirname(output_path)
    print("Extracting Directory {} from path {}".format(directory_name, output_path))
    df.to_csv(filename)

    # Datastore.upload() is supported currently, but is being deprecated in favor of
    # Dataset.File.upload_directory():
    # datastore.upload(src_dir=output_folder.name, target_path=directory_name,
    #                  overwrite=False, show_progress=True)
    # upload_directory can fail sometimes.
    output_dataset = Dataset.File.upload_directory(src_dir=output_folder.name,
                                                   target=(datastore, directory_name))
    return df
def setup_azureml():
    """
    Get an Azure ML workspace from environment variables.

    Assumes the following are created outside of the code in this project:
        AML workspace
        AML datastore
        AML compute resource for training (can be blank for inferencing)
        AML compute resource for inferencing (can be blank for training)
    """
    subscription_id = os.environ['AML_SUBSCRIPTION']
    resource_group = os.environ['AML_RESOURCE_GROUP']
    workspace_name = os.environ['AML_WORKSPACE']
    datastore_name = os.environ['AML_DATASTORE']
    training_target_name = os.environ.get('AML_COMPUTE')
    inference_target_name = os.environ.get('AML_INFERENCE_COMPUTE')

    ws = Workspace(subscription_id, resource_group, workspace_name)
    ds = Datastore.get(ws, datastore_name=datastore_name)

    if training_target_name:
        training_target = ws.compute_targets[training_target_name]
    else:
        training_target = None
    if inference_target_name:
        inference_target = ws.compute_targets[inference_target_name]
    else:
        inference_target = None

    return ws, ds, training_target, inference_target
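# Hedged usage sketch for setup_azureml() above (not from the original source): in practice
# the environment variables are exported before the process starts; the angle-bracket values
# below are placeholders that must be replaced with real identifiers.
import os

if __name__ == "__main__":
    os.environ.setdefault("AML_SUBSCRIPTION", "<subscription-id>")
    os.environ.setdefault("AML_RESOURCE_GROUP", "<resource-group>")
    os.environ.setdefault("AML_WORKSPACE", "<workspace-name>")
    os.environ.setdefault("AML_DATASTORE", "<datastore-name>")
    ws, ds, training_target, inference_target = setup_azureml()
    print(ws.name, ds.name, training_target, inference_target)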
def register_sql_datastore(
    workspace: Workspace,
    sql_datastore_name: str,
    sql_server_name: str,
    sql_database_name: str,
    sql_username: str,
    sql_password: str,
) -> AzureSqlDatabaseDatastore:
    """
    Register an Azure SQL DB with the Azure Machine Learning Workspace

    :param workspace: Azure Machine Learning Workspace
    :param sql_datastore_name: Name used to id the SQL Datastore
    :param sql_server_name: Azure SQL Server Name
    :param sql_database_name: Azure SQL Database Name
    :param sql_username: Azure SQL Database Username
    :param sql_password: Azure SQL Database Password
    :return: Pointer to Azure Machine Learning SQL Datastore
    """
    return Datastore.register_azure_sql_database(
        workspace=workspace,
        datastore_name=sql_datastore_name,
        server_name=sql_server_name,
        database_name=sql_database_name,
        username=sql_username,
        password=sql_password,
    )
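# Hedged usage sketch for register_sql_datastore() above (not from the original source):
# the server and database names are hypothetical, and the credentials are read from assumed
# environment variables so they are not hard-coded.
import os
from azureml.core import Workspace

if __name__ == "__main__":
    ws = Workspace.from_config()
    sql_ds = register_sql_datastore(
        workspace=ws,
        sql_datastore_name="sales_sql",           # hypothetical
        sql_server_name="my-sql-server",          # hypothetical
        sql_database_name="salesdb",              # hypothetical
        sql_username=os.environ["SQL_USER"],      # assumed env var
        sql_password=os.environ["SQL_PASSWORD"],  # assumed env var
    )
    print("Registered SQL datastore:", sql_ds.name)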
def __init__(self): self._parser = argparse.ArgumentParser("evaluate") self._parser.add_argument( "--release_id", type=str, help="The ID of the release triggering this pipeline run") self._parser.add_argument("--model_name", type=str, help="Name of the tf model") self._parser.add_argument("--ckpt_path", type=str, help="Chekpoint path", default="checkpoint/yolov3.ckpt") self._parser.add_argument("--datastore", type=str, help="Name of the datastore", default="epis_datastore") self._parser.add_argument("--storage_container", type=str, help="Name of the storage container", default="ppe") self._args = self._parser.parse_args() self._run = Run.get_context() self._exp = self._run.experiment self._ws = self._run.experiment.workspace self._datastore = Datastore.get(self._ws, datastore_name=self._args.datastore) self._INPUT_SIZE = 416 self._NUM_CLASS = len(utils.read_class_names(cfg.YOLO.CLASSES)) self._CLASSES = utils.read_class_names(cfg.YOLO.CLASSES) self._predicted_dir_path = 'mAP/predicted' self._ground_truth_dir_path = 'mAP/ground-truth'
def __init__(self): self._parser = argparse.ArgumentParser("train") self._parser.add_argument( "--release_id", type=str, help="The ID of the release triggering this pipeline run") self._parser.add_argument("--model_name", type=str, help="Name of the tf model") self._parser.add_argument("--ckpt_path", type=str, help="Chekpoint path", default="checkpoint/yolov3.ckpt") self._parser.add_argument("--datastore", type=str, help="Name of the datastore", default="epis_datastore") self._parser.add_argument("--storage_container", type=str, help="Name of the storage container", default="ppe") self._args = self._parser.parse_args() self._run = Run.get_context() self._exp = self._run.experiment self._ws = self._run.experiment.workspace self._tb = Tensorboard([self._run]) self._datastore = Datastore.get(self._ws, datastore_name=self._args.datastore)
def main():
    # Workspace
    ws = Workspace.from_config()

    # Compute
    compute = AmlCompute(workspace=ws, name='gandalf')

    # Data source
    datastore = Datastore.get(ws, datastore_name='surfrider')

    # Experiment
    script_params = {
        "--datastore": datastore.as_mount()
    }

    # Create and run the experiment
    estimator = Estimator(source_directory='./',
                          script_params=script_params,
                          compute_target=compute,
                          entry_script='train.py',
                          use_gpu=True,
                          pip_packages=['opencv-python>=4.1',
                                        'tensorpack==0.9.8',
                                        'tensorflow-gpu>=1.3,<2.0',
                                        'tqdm>=4.36.1',
                                        'cython>=0.29.13',
                                        'scipy>=1.3.1',
                                        'ffmpeg-python',
                                        'wget'])

    exp = Experiment(ws, 'surfrider_rcnn')
    run = exp.submit(estimator)
def register_dataset(
    aml_workspace: Workspace,
    dataset_name: str,
    datastore_name: str,
    file_path: str = "COVID19Articles.csv",
) -> Dataset:
    if datastore_name:
        datastore = Datastore.get(aml_workspace, datastore_name)
    else:
        datastore = Datastore.get_default(aml_workspace)

    dataset = Dataset.Tabular.from_delimited_files(path=(datastore, file_path))
    dataset = dataset.register(workspace=aml_workspace,
                               name=dataset_name,
                               create_new_version=True)
    return dataset
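# Illustrative call to register_dataset() above (not from the original source): the dataset
# name is hypothetical; passing an empty datastore_name falls back to the workspace default
# datastore, as coded above.
from azureml.core import Workspace

if __name__ == "__main__":
    ws = Workspace.from_config()
    ds = register_dataset(ws,
                          dataset_name="covid_articles",  # hypothetical
                          datastore_name="",              # empty -> default datastore
                          file_path="COVID19Articles.csv")
    print(ds.name, ds.version)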
def _create_datastore(
    aml_workspace,
    datastore_name,
    container_name,
    account_name,
    account_key,
    create_if_not_exists=True,
):
    """Creates a datastore.

    Args:
        datastore_name (string): Name you wish to assign to your datastore.
        container_name (string): Name of your container.
        account_name (string): Storage account name.
        account_key (string): The storage account key.

    Returns:
        azureml.core.Datastore
    """
    logger = logging.getLogger(__name__)
    ds = Datastore.register_azure_blob_container(
        workspace=aml_workspace,
        datastore_name=datastore_name,
        container_name=container_name,
        account_name=account_name,
        account_key=account_key,
        create_if_not_exists=create_if_not_exists,
    )
    logger.info(f"Registered existing blob storage: {ds.name}.")
    return ds
def __init__(self):
    self.__parser = argparse.ArgumentParser("preprocessing")
    self.__parser.add_argument("--datastore", type=str, help="Name of the datastore",
                               default="workspaceblobstore")
    self.__parser.add_argument("--dataset_name", type=str, help="Name of the dataset")
    self.__parser.add_argument("--dataset_preprocessed_name", type=str,
                               help="Standard preprocessed dataset")
    self.__parser.add_argument("--output_preprocess_dataset", type=str,
                               help="Name of the PipelineData reference")
    self.__args = self.__parser.parse_args()
    self.__run = Run.get_context()
    self.__local_run = type(self.__run) == _OfflineRun
    if self.__local_run:
        self.__ws = Workspace.from_config('../../notebooks-settings')
        self.__exp = Experiment(self.__ws, 'exploratory_analysis')
        self.__run = self.__exp.start_logging()
    else:
        self.__ws = self.__run.experiment.workspace
        self.__exp = self.__run.experiment
    self.__datastore = Datastore.get(self.__ws, datastore_name=self.__args.datastore)
def create_pipeline(self):
    '''
    IRIS data training and validation
    '''
    self.datastore = Datastore.get(self.workspace, self.workspace.get_default_datastore().name)
    print("Received datastore")

    input_ds = self.get_files_from_datastore(self.args.container_name, self.args.input_csv)
    final_df = input_ds.to_pandas_dataframe()
    print("Input DF Info", final_df.info())
    print("Input DF Head", final_df.head())

    X = final_df[["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]]
    y = final_df[["Species"]]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1984)

    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Model Score : ", model.score(X_test, y_test))

    joblib.dump(model, self.args.model_path)

    self.validate(y_test, y_pred, X_test)

    match = re.search(r'([^\/]*)$', self.args.model_path)
    # Upload the model to the run artifacts
    self.run.upload_file(name=self.args.artifact_loc + match.group(1),
                         path_or_stream=self.args.model_path)
    print("Run Files : ", self.run.get_file_names())
    self.run.complete()
def get_by_data_reference(cls, workspace, path):
    data_store = Datastore(workspace, cls.DEFAULT_GLOBAL_DATASET_STORE)
    return DataReference(
        datastore=data_store,
        data_reference_name=cls.DEFAULT_DATA_REFERENCE_NAME,
        path_on_datastore=path,
    )
def ConnectToAzure():
    """
    Connect to the Azure workspace, compute target, datastore and experiment
    """
    # Connect to workspace
    # config.json file expected in ./azureml directory
    # config.json can be generated from the Azure portal while browsing the workspace
    global az_workspace
    az_workspace = Workspace.from_config()
    print("Workspace:", az_workspace.name)

    # Connect to compute for training
    # The compute target must belong to the workspace, and compute targets are limited by the
    # workspace region; cross-workspace compute targets may become possible in the future
    global az_computetarget
    az_computetarget = ComputeTarget(workspace=az_workspace, name="AzPytrch-NC6")
    print("Compute Target:", az_computetarget.name)

    # Connect to the datastore for the training images
    # The datastore must be associated with a storage account belonging to the workspace
    global az_datastore
    az_datastore = Datastore.get_default(az_workspace)
    print("Datastore:", az_datastore.name)

    # Connect to the experiment
    global az_experiment
    az_experiment = Experiment(workspace=az_workspace, name='616_Final')
    print("Experiment:", az_experiment.name)
def register_blob_datastore(
    workspace: Workspace,
    blob_datastore_name: str,
    container_name: str,
    account_name: str,
    account_key: str,
    datastore_rg: str,
) -> AzureBlobDatastore:
    """
    Register a Blob Storage Account with the Azure Machine Learning Workspace

    :param workspace: Azure Machine Learning Workspace
    :param blob_datastore_name: Name for the blob datastore
    :param container_name: Name of the blob container
    :param account_name: Name of the blob storage account
    :param account_key: Blob storage account key used for auth
    :param datastore_rg: Resource group containing the Azure Storage Account
    :return: Pointer to Azure Machine Learning Blob Datastore
    """
    return Datastore.register_azure_blob_container(
        workspace=workspace,
        datastore_name=blob_datastore_name,
        container_name=container_name,
        account_name=account_name,
        account_key=account_key,
        resource_group=datastore_rg,
        overwrite=True,
    )
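# Hedged usage sketch for register_blob_datastore() above (not from the original source):
# note the function passes overwrite=True, so re-running it replaces an existing registration
# with the same name. All names below are hypothetical; the key comes from an assumed env var.
import os
from azureml.core import Workspace

if __name__ == "__main__":
    ws = Workspace.from_config()
    blob_ds = register_blob_datastore(
        workspace=ws,
        blob_datastore_name="curated_blob",      # hypothetical
        container_name="curated",                # hypothetical
        account_name="mystorageaccount",         # hypothetical
        account_key=os.environ["STORAGE_KEY"],   # assumed env var
        datastore_rg="my-resource-group",        # hypothetical
    )
    print("Registered blob datastore:", blob_ds.name)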
def load_tabular_partition(self, partition_name: str, datastore_name: str = None,
                           columns: np.array = None, first_row_header: bool = False,
                           cloud_storage: bool = True) -> pd.DataFrame:
    '''
    Loads a partition from a tabular dataset.
        The implementation will connect to the Datastore and get all delimited files
        matching the partition_name.
        When configured locally, the implementation will append all files in the
        datastore_name folder with name {partition_name}.csv
    Args:
        partition_name (str): The name of the partition as a wildcard filter.
            Example: B* will take all files starting with B and ending with csv
        datastore_name (str): The name of a Datastore that contains Datasets
        columns (np.array): The column names to assign to the dataframe
        first_row_header (bool): Whether the first row of each file contains the header
        cloud_storage (bool): When changed to False, the dataset will be loaded from
            the local folder
    Returns:
        pd.DataFrame: The dataset, loaded as a DataFrame
    '''
    if not datastore_name:
        # No datastore name is given, so we'll take the default one
        datastore_name = self.__datastore_path

    if cloud_storage:
        # Connect to the datastore
        datastore = Datastore(self.__workspace, name=datastore_name)
        try:
            _header = PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS if first_row_header else False
            _aml_dataset = Dataset.Tabular.from_delimited_files(
                header=_header,
                path=DataPath(datastore, '/' + partition_name + '.csv'))  # , set_column_types=columns
            _df = _aml_dataset.to_pandas_dataframe()
        except DatasetValidationError as dsvalex:
            if 'provided path is not valid' in str(dsvalex):
                return None
            else:
                raise
    else:
        # Read data from matching files in a local folder
        _folder_path = datastore_name
        _partition_files = glob.glob(_folder_path + '/' + partition_name + '.csv')
        _record_found = False
        _df = None
        for filename in _partition_files:
            _header = 0 if first_row_header else None
            df = pd.read_csv(filename, index_col=None, header=_header)
            if not _record_found:
                _df = df
                _record_found = True
            else:
                _df = pd.concat([_df, df])

        if not _record_found:
            return None

    if columns is not None:
        _df.columns = columns
    return _df
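# Usage sketch (not from the original source): assumes the surrounding class is instantiated
# as `datasets` (hypothetical name) against a real workspace. The wildcard pulls every
# delimited file on the datastore whose name starts with "B".
#
#   df = datasets.load_tabular_partition(
#       partition_name="B*",
#       datastore_name="partitioned_data",   # hypothetical datastore name
#       columns=np.array(["id", "amount", "label"]),
#       first_row_header=False,
#       cloud_storage=True)
#   print(df.shape if df is not None else "no matching partition files")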
def convert_voc_annotation(ws, ds, data_type, anno_path, container_name, use_difficult_bbox=True):

    classes = ['helmet', 'none']

    datastore = Datastore.get(ws, datastore_name=ds)
    voc_dataset_annotations = datastore.blob_service.list_blobs(container_name,
                                                                prefix='VOC/Annotations')
    voc_dataset_images = datastore.blob_service.list_blobs(container_name,
                                                           prefix='VOC/JPEGImages')
    voc_dataset_imagesets = datastore.blob_service.list_blobs(
        container_name, prefix=f'VOC/ImageSets/Main/{data_type}.txt')

    voc_list_annotations = list(voc_dataset_annotations)
    print("Successfully listed annotations")
    voc_list_images = list(voc_dataset_images)
    print("Successfully listed images")
    voc_list_imagesets = list(voc_dataset_imagesets)
    print("Successfully listed imagesets")

    txt = datastore.blob_service.get_blob_to_text(container_name, voc_list_imagesets[0].name)
    txt_split = txt.content.splitlines()
    image_inds = [line.strip() for line in txt_split]

    with open(anno_path, 'a') as f:
        for image_ind in image_inds:
            image_path = datastore.blob_service.make_blob_url(
                container_name, 'VOC/JPEGImages/' + image_ind + '.jpg')
            annotation = image_path
            label_path = datastore.blob_service.get_blob_to_text(
                container_name, 'VOC/Annotations/' + image_ind + '.xml').content
            root = ET.fromstring(label_path)
            objects = root.findall('object')
            for obj in objects:
                difficult = obj.find('difficult').text.strip()
                if (not use_difficult_bbox) and (int(difficult) == 1):
                    continue
                bbox = obj.find('bndbox')
                class_ind = classes.index(obj.find('name').text.lower().strip())
                xmin = bbox.find('xmin').text.strip()
                xmax = bbox.find('xmax').text.strip()
                ymin = bbox.find('ymin').text.strip()
                ymax = bbox.find('ymax').text.strip()
                annotation += ' ' + ','.join([xmin, ymin, xmax, ymax, str(class_ind)])
            print(annotation)
            f.write(annotation + "\n")

    datastore.blob_service.create_blob_from_path(
        container_name, anno_path, anno_path,
        content_settings=ContentSettings(content_type=__get_mime_type(anno_path)))
def _get_datastore_and_path(self, config):
    from azureml.core import Datastore

    output_location = config["OutputLocation"]
    data_path = output_location["DataPath"]
    datastore = Datastore(self._workspace, data_path["DatastoreName"])
    return datastore, data_path["RelativePath"]
def register_dataset(aml_workspace: Workspace,
                     dataset_name: str,
                     datastore_name: str,
                     file_path: str) -> Dataset:
    datastore = Datastore.get(aml_workspace, datastore_name)
    dataset = Dataset.Tabular.from_delimited_files(path=(datastore, file_path))
    dataset = dataset.register(workspace=aml_workspace,
                               name=dataset_name,
                               create_new_version=True)
    return dataset
def register_dataset(workspace, datastore_name, dataset_name, file_path):
    datastore = Datastore.get(workspace=workspace, datastore_name=datastore_name)
    dataset = Dataset.Tabular.from_delimited_files(path=(datastore, file_path))
    # create_new_version=True registers a new version if the dataset already exists
    # (alternatively, pass exist_ok=True)
    dataset = dataset.register(
        workspace=workspace,
        name=dataset_name,
        create_new_version=True
    )
    return dataset
def get_datastore():
    env = EnvironmentVariables()
    datastore_name = env.datastore_name
    storage_account_name = env.storage_account_name
    storage_container_name = env.storage_container_name
    storage_account_key = env.storage_account_key
    workspace = get_workspace()

    try:
        datastore = Datastore.get(workspace=workspace, datastore_name=datastore_name)
    except HttpOperationError:
        datastore = Datastore.register_azure_blob_container(
            workspace=workspace,
            datastore_name=datastore_name,
            account_name=storage_account_name,
            container_name=storage_container_name,
            account_key=storage_account_key)

    return datastore
def main(_):
    # Export the trained model
    if not os.path.exists(FLAGS.export_dir):
        os.makedirs(FLAGS.export_dir)

    run.log('accuracy', float(0.91))
    run.log('val_accuracy', float(0.901))

    datastore = Datastore.get(ws, 'mtcseattle')
    datastore.download(FLAGS.export_dir, prefix="model")
def prepare_data(workspace):
    datastore = Datastore.get(workspace, TRAINING_DATASTORE)
    x_train = get_df_from_datastore_path(datastore, 'train/X_train.csv')
    y_train = get_df_from_datastore_path(datastore, 'train/y_train.csv')
    y_train = y_train['Target']
    x_test = get_df_from_datastore_path(datastore, 'test/X_test.csv')
    y_test = get_df_from_datastore_path(datastore, 'test/y_test.csv')
    y_test = y_test['Target']
    x_train = remove_collinear_cols(x_train)
    x_test = remove_collinear_cols(x_test)
    return x_train, y_train, x_test, y_test
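# get_df_from_datastore_path() is referenced above but not shown here. A minimal sketch,
# assuming it simply reads one delimited file from the datastore into pandas via a
# TabularDataset (the real helper may differ):
from azureml.core import Dataset

def get_df_from_datastore_path(datastore, relative_path):
    # Build a TabularDataset over a single CSV on the datastore and materialize it
    tabular_ds = Dataset.Tabular.from_delimited_files(path=[(datastore, relative_path)])
    return tabular_ds.to_pandas_dataframe()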