def __exit__(self, *exc_details):
    """Upload files for datastore.

    Context-manager exit hook: for every configured data reference that is
    marked as an upload, pushes the local file or directory on the compute
    to its target datastore. ADLS Gen1 datastores go through the dataprep
    engine API; all other datastores use the Datastore upload helpers.

    :param exc_details: exception details forwarded by the ``with`` statement
        (ignored — upload happens regardless of an in-flight exception).
    :return: None
    """
    # Imported lazily so the azureml.core / dataprep dependencies are only
    # required when the context manager actually exits.
    from azureml.core.datastore import Datastore
    from azureml.data._dataprep_helper import dataprep
    module_logger.debug("Enter __exit__ function of datastore cmgr")
    for key, value in self._config.items():
        df_config, force_read = self._to_data_reference_config(value)
        # Only entries configured for upload are processed here.
        if self._is_upload(df_config):
            self._validate_config(df_config, key)
            ds = Datastore(workspace=self._workspace,
                           name=df_config.data_store_name)
            if os.path.isdir(df_config.path_on_compute):
                # Directory upload path.
                if self._is_datastore_adlsgen1(ds):
                    # ADLS Gen1 is not supported by Datastore.upload, so the
                    # dataprep engine API is used instead.
                    module_logger.debug(
                        "AzureDataLake Gen1 used as Datastore for upload dir."
                    )
                    dataprep().api.engineapi.api.get_engine_api(
                    ).upload_directory(
                        dataprep().api.engineapi.typedefinitions.
                        UploadDirectoryMessageArguments(
                            base_path=df_config.path_on_compute,
                            folder_path=df_config.path_on_compute,
                            destination=dataprep(
                            ).api._datastore_helper._to_stream_info_value(
                                ds, df_config.path_on_data_store),
                            force_read=force_read,
                            overwrite=df_config.overwrite,
                            # NOTE(review): uploads are serialized — presumably
                            # a deliberate throttle; confirm before raising.
                            concurrent_task_count=1))
                else:
                    ds.upload(src_dir=df_config.path_on_compute,
                              target_path=df_config.path_on_data_store,
                              overwrite=df_config.overwrite)
            elif os.path.isfile(df_config.path_on_compute):
                # Single-file upload path.
                if self._is_datastore_adlsgen1(ds):
                    module_logger.debug(
                        "AzureDataLake Gen1 used as Datastore for upload file."
                    )
                    dataprep().api.engineapi.api.get_engine_api(
                    ).upload_file(
                        dataprep().api.engineapi.typedefinitions.
                        UploadFileMessageArguments(
                            # base_path anchors the relative layout of the
                            # uploaded file at its parent directory.
                            base_path=os.path.dirname(
                                df_config.path_on_compute),
                            local_path=df_config.path_on_compute,
                            destination=dataprep(
                            ).api._datastore_helper._to_stream_info_value(
                                ds, df_config.path_on_data_store),
                            force_read=force_read,
                            overwrite=df_config.overwrite))
                else:
                    ds.upload_files(
                        files=[df_config.path_on_compute],
                        target_path=df_config.path_on_data_store,
                        overwrite=df_config.overwrite)
            # NOTE(review): a path that is neither an existing file nor an
            # existing directory is silently skipped — confirm intended.
    module_logger.debug("Exit __exit__ function of datastore cmgr")
def upload_dataset(self, dataset_name: str, local_folder: str,
                   datastore_name: str = None, overwrite: bool = False,
                   tags: dict = None) -> "FileDataset":
    '''
    Uploads data from a local directory into an AzureML Datastore that
    points to Azure Data lake, then registers the files as a dataset.

    Args:
        dataset_name (str): The name of the dataset to register
        local_folder (str): The location of the local directory to take files from
        datastore_name (str): The name of a Datastore that will contain the dataset;
            defaults to the configured datastore when omitted
        overwrite (bool): Whether to overwrite files already on the datastore
        tags (dict): Optional tags attached to the registered dataset
    Returns:
        FileDataset: The registered dataset, containing the files
    '''
    if not datastore_name:
        # No datastore name is given, so we'll take the default one
        datastore_name = self.__datastore_path

    # Connecting data store
    datastore = Datastore(self.__workspace, name=datastore_name)

    # TODO : check type of datastore
    datastore.upload(src_dir=local_folder, target_path=dataset_name,
                     overwrite=overwrite, show_progress=True)

    datastore_paths = [(datastore, dataset_name)]
    file_ds = Dataset.File.from_files(path=datastore_paths)

    file_ds = file_ds.register(workspace=self.__workspace,
                               name=dataset_name,
                               description=dataset_name,
                               tags=tags,
                               create_new_version=True)
    # Bug fix: the registered dataset promised by the docstring was
    # previously never returned (and the annotation wrongly said DataFrame).
    return file_ds
# In[ ]: # Default datastore (Azure file storage) def_file_store = ws.get_default_datastore() print("Default datastore's name: {}".format(def_file_store.name)) def_blob_store = Datastore(ws, "workspaceblobstore") print("Blobstore's name: {}".format(def_blob_store.name)) # In[ ]: # Upload the raw training data to the blob storage def_blob_store.upload(src_dir=data_location, target_path='nyc-taxi-raw-features', overwrite=True, show_progress=True) raw_train_data = DataReference(datastore=def_blob_store, data_reference_name="nyc_taxi_raw_features", path_on_datastore="nyc-taxi-raw-features/nyc-taxi-sample-data.csv") print("DataReference object created") # ### Create the Process Training Data Pipeline Step # The intermediate data (or output of a Step) is represented by PipelineData object. PipelineData can be produced by one step and consumed in another step by providing the PipelineData object as an output of one step and the input of one or more steps. # # The process training data pipeline step takes the raw_train_data DataReference object as input, and it will output an intermediate PipelineData object that holds the processed training data with the new engineered features for datetime components: hour of the day, and day of the week. # # Review and run the cell below to construct the PipelineData objects and the PythonScriptStep pipeline step:
#%% first save the files to disk if (not os.path.exists("./Upload")): os.mkdir("./Upload") os.mkdir("./Upload/Data") os.mkdir("./Upload/Model") df_pca.to_csv("./Upload/Data/data.csv", index=False) pickle.dump( pca_model, open( "./Upload/Model/model.pkl", "wb" ) ) #%% now you can upload that directory to blobstorage # I use the date to diferentiate the different versions blob_path = f"Campus_Recruitment/{datetime.now().strftime('%Y-%m-%d')}"# if None will upload to root local_path = "./Upload/Data" blob_store.upload(src_dir=local_path, target_path=blob_path, overwrite=True, show_progress=True) #%% # ** Register the data as a dataset ** # %% now that the data is up on the blobstore we can register it as a dataset # to keep track of its versions and make it easily acessible dataset = Dataset.File.from_files( blob_store.path(blob_path + "/data.csv") ) dataset.register(ws, name="Campus_Recruitment_PCA_Training_Data", create_new_version=True) #%% # ** Upload and register the model as a Model ** #%% model = Model.register(workspace=ws,
# split the data using scikit-learn
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=float(test_size), random_state=101)

# join train and train label; same for test, so each split is one dataframe
train = pd.concat([X_train, y_train], axis=1)
validation = pd.concat([X_test, y_test], axis=1)

# make sure folder_name was passed in as an argument; everything below
# (local staging in "files" and the datastore upload) needs it
if folder_name is not None:
    # local staging directory for the CSVs before upload
    os.makedirs("files", exist_ok=True)
    # Bug fix: the message previously claimed folder_name was created,
    # but the directory actually created is "files".
    print("files directory created")

    # set the target path of the datastore to hold
    # test and validation datasets, one dated folder per day
    current_folder = str(datetime.now().date())
    target_path = os.path.join(folder_name, current_folder)

    train_file = os.path.join("files", train_file_name)
    val_file = os.path.join("files", val_file_name)

    # save the dataframes to the local drive to then upload the contents
    # of the folder in one call
    train.to_csv(train_file, header=True, index=False)
    validation.to_csv(val_file, header=True, index=False)

    datastore.upload("files", target_path=target_path,
                     overwrite=True, show_progress=False)