def __exit__(self, *exc_details):
    """Upload files for datastore.

    :param exc_details:
    :return:
    """
    from azureml.core.datastore import Datastore
    from azureml.data._dataprep_helper import dataprep

    module_logger.debug("Enter __exit__ function of datastore cmgr")
    for key, value in self._config.items():
        df_config, force_read = self._to_data_reference_config(value)
        if self._is_upload(df_config):
            self._validate_config(df_config, key)
            ds = Datastore(workspace=self._workspace, name=df_config.data_store_name)
            if os.path.isdir(df_config.path_on_compute):
                if self._is_datastore_adlsgen1(ds):
                    module_logger.debug(
                        "AzureDataLake Gen1 used as Datastore for upload dir.")
                    dataprep().api.engineapi.api.get_engine_api().upload_directory(
                        dataprep().api.engineapi.typedefinitions.UploadDirectoryMessageArguments(
                            base_path=df_config.path_on_compute,
                            folder_path=df_config.path_on_compute,
                            destination=dataprep().api._datastore_helper._to_stream_info_value(
                                ds, df_config.path_on_data_store),
                            force_read=force_read,
                            overwrite=df_config.overwrite,
                            concurrent_task_count=1))
                else:
                    ds.upload(src_dir=df_config.path_on_compute,
                              target_path=df_config.path_on_data_store,
                              overwrite=df_config.overwrite)
            elif os.path.isfile(df_config.path_on_compute):
                if self._is_datastore_adlsgen1(ds):
                    module_logger.debug(
                        "AzureDataLake Gen1 used as Datastore for upload file.")
                    dataprep().api.engineapi.api.get_engine_api().upload_file(
                        dataprep().api.engineapi.typedefinitions.UploadFileMessageArguments(
                            base_path=os.path.dirname(df_config.path_on_compute),
                            local_path=df_config.path_on_compute,
                            destination=dataprep().api._datastore_helper._to_stream_info_value(
                                ds, df_config.path_on_data_store),
                            force_read=force_read,
                            overwrite=df_config.overwrite))
                else:
                    ds.upload_files(files=[df_config.path_on_compute],
                                    target_path=df_config.path_on_data_store,
                                    overwrite=df_config.overwrite)
    module_logger.debug("Exit __exit__ function of datastore cmgr")
def main(args):
    # Load workspace
    print("Loading Workspace")
    workspace = Workspace.from_config()
    print(
        f"Workspace name: {workspace.name}",
        f"Azure region: {workspace.location}",
        f"Subscription id: {workspace.subscription_id}",
        f"Resource group: {workspace.resource_group}",
        sep="\n"
    )

    # Print all datastores registered in the workspace
    print("Printing all datastores")
    for name, datastore in workspace.datastores.items():
        print(name, datastore.datastore_type, sep="\t")

    # Load the target datastore by name
    print("Loading datastore")
    datastore = Datastore(
        workspace=workspace,
        name=args.datastore_name
    )

    # Upload dataset
    print("Uploading dataset")
    datastore.upload_files(
        files=["./train_dataset/iris.csv"],
        target_path="train_dataset",  # directory on the datastore; upload_files expects a folder, not a file path
        overwrite=True,
        show_progress=True
    )

    # Register dataset
    file_dataset = Dataset.File.from_files()
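    # A minimal sketch, not from the source, of how the registration step above
    # might be completed, assuming the file landed at "train_dataset/iris.csv"
    # on the datastore; the dataset name "iris_train" is a hypothetical example.
    file_dataset = Dataset.File.from_files(
        path=(datastore, "train_dataset/iris.csv")
    )
    file_dataset = file_dataset.register(
        workspace=workspace,
        name="iris_train",  # hypothetical name, not taken from the original script
        description="Iris training data uploaded from ./train_dataset/iris.csv",
        create_new_version=True
    )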
}, delimiter=",", header=None) df_log.columns = [ 'ModelType', 'FileName', 'ModelName', 'StartTime', 'EndTime', 'Duration', 'Index', 'BatchSize', 'Status' ] df_log['ModelType'] = df_log['ModelType'].apply(str).str.replace("'", '') df_log['FileName'] = df_log['FileName'].apply(str).str.replace("'", '') df_log['ModelName'] = df_log['ModelName'].apply(str).str.replace("'", '') df_log['StartTime'] = df_log['StartTime'].apply(str).str.replace("'", '') df_log['EndTime'] = df_log['EndTime'].apply(str).str.replace("'", '') df_log['Duration'] = df_log['Duration'].apply(str).str.replace("'", '') df_log['Status'] = df_log['Status'].apply(str).str.replace("'", '') print(df_log.head()) print('Read and cleaned the log file') # save the log file output_path = os.path.join('./logs/', 'training_log') df_log.to_csv(path_or_buf=output_path + '.csv', index=False) print('Saved the training_log.csv') # upload the log file log_dstore = Datastore(ws, args.datastore) log_dstore.upload_files(['./logs/training_log' + '.csv'], target_path='training_log_' + str(datetime.datetime.now().date()), overwrite=args.overwrite_logs, show_progress=True) print('Uploaded the training_log.csv')
def load(quick_run, data_path, cache_path, model_name, num_gpus, random_seed):
    # Set QUICK_RUN = True to run the notebook on a small subset of data and a
    # smaller number of epochs.
    QUICK_RUN = quick_run

    # Wikigold dataset
    DATA_URL = (
        "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets"
        "/master/data/wikigold/CONLL-format/data/wikigold.conll.txt"
    )

    # fraction of the dataset used for testing
    TEST_DATA_FRACTION = 0.3
    # sub-sampling ratio
    SAMPLE_RATIO = 1
    # the data path used to save the downloaded data file
    DATA_PATH = data_path
    # the cache data path used during fine-tuning
    CACHE_DIR = cache_path

    if not os.path.exists(os.path.dirname(DATA_PATH)):
        os.mkdir(os.path.dirname(DATA_PATH))
    if not os.path.exists(DATA_PATH):
        os.mkdir(DATA_PATH)
    if not os.path.exists(CACHE_DIR):
        os.mkdir(CACHE_DIR)

    # set random seeds
    RANDOM_SEED = random_seed
    torch.manual_seed(RANDOM_SEED)

    MODEL_NAME = model_name
    # MODEL_NAME = "distilbert"
    DO_LOWER_CASE = False
    MAX_SEQ_LENGTH = 200
    TRAILING_PIECE_TAG = "X"
    NUM_GPUS = num_gpus
    BATCH_SIZE = 16

    # update variables for quick run option
    if QUICK_RUN:
        SAMPLE_RATIO = 0.1
        NUM_TRAIN_EPOCHS = 1

    # download data
    file_name = DATA_URL.split("/")[-1]  # a name for the downloaded file
    maybe_download(DATA_URL, file_name, DATA_PATH)
    data_file = os.path.join(DATA_PATH, file_name)

    # parse CoNLL file
    sentence_list, labels_list = read_conll_file(data_file, sep=" ", encoding='utf-8')

    # sub-sample (optional)
    random.seed(RANDOM_SEED)
    sample_size = int(SAMPLE_RATIO * len(sentence_list))
    sentence_list, labels_list = list(
        zip(*random.sample(list(zip(sentence_list, labels_list)), k=sample_size))
    )

    # train-test split
    train_sentence_list, test_sentence_list, train_labels_list, test_labels_list = train_test_split(
        sentence_list, labels_list, test_size=TEST_DATA_FRACTION, random_state=RANDOM_SEED
    )

    processor = TokenClassificationProcessor(
        model_name=MODEL_NAME, to_lower=DO_LOWER_CASE, cache_dir=CACHE_DIR
    )

    label_map = TokenClassificationProcessor.create_label_map(
        label_lists=labels_list, trailing_piece_tag=TRAILING_PIECE_TAG
    )

    train_dataset = processor.preprocess(
        text=train_sentence_list,
        max_len=MAX_SEQ_LENGTH,
        labels=train_labels_list,
        label_map=label_map,
        trailing_piece_tag=TRAILING_PIECE_TAG,
    )
    # train_data_loader = DataLoader(train_dataset)

    test_dataset = processor.preprocess(
        text=test_sentence_list,
        max_len=MAX_SEQ_LENGTH,
        labels=test_labels_list,
        label_map=label_map,
        trailing_piece_tag=TRAILING_PIECE_TAG,
    )

    torch.save(train_dataset, os.path.join(DATA_PATH, 'train.pt'))
    torch.save(test_dataset, os.path.join(DATA_PATH, 'test.pt'))
    torch.save(label_map, os.path.join(DATA_PATH, 'label_map.pt'))

    # Default datastore
    def_data_store = ws.get_default_datastore()
    # Get the blob storage associated with the workspace
    def_blob_store = Datastore(ws, "workspaceblobstore")
    # Get file storage associated with the workspace
    def_file_store = Datastore(ws, "workspacefilestore")

    try:
        def_blob_store.upload_files(
            [os.path.join(DATA_PATH, 'train.pt')],
            target_path="nerdata",
            overwrite=True,
            show_progress=True)
    except Exception as e:
        print(f"Failed to upload -> {e}")

    try:
        def_blob_store.upload_files(
            [os.path.join(DATA_PATH, 'test.pt')],
            target_path="nerdata",
            overwrite=True,
            show_progress=True)
    except Exception as e:
        print(f"Failed to upload -> {e}")

    try:
        def_blob_store.upload_files(
            [os.path.join(DATA_PATH, 'label_map.pt')],
            target_path="nerdata",
            overwrite=True,
            show_progress=True)
    except Exception as e:
        print(f"Failed to upload -> {e}")

    train_datastore_paths = [(def_blob_store, 'nerdata/train.pt')]
    test_datastore_paths = [(def_blob_store, 'nerdata/test.pt')]
    label_map_datastore_paths = [(def_blob_store, 'nerdata/label_map.pt')]

    # def_blob_store.upload(src_dir=DATA_PATH, target_path="nerdata", overwrite=True, show_progress=True)

    train_ds = Dataset.File.from_files(path=train_datastore_paths)
    test_ds = Dataset.File.from_files(path=test_datastore_paths)
    label_map_ds = Dataset.File.from_files(path=label_map_datastore_paths)

    train_ds = train_ds.register(
        workspace=ws,
        name='ner_bert_train_ds',
        description='Named Entity Recognition with BERT (Training set)',
        create_new_version=False)

    test_ds = test_ds.register(
        workspace=ws,
        name='ner_bert_test_ds',
        description='Named Entity Recognition with BERT (Testing set)',
        create_new_version=False)

    label_map_ds = label_map_ds.register(
        workspace=ws,
        name='ner_bert_label_map_ds',
        description='Named Entity Recognition with BERT (Label map)',
        create_new_version=False)

    train_dataloader = dataloader_from_dataset(
        train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True, distributed=False
    )
    test_dataloader = dataloader_from_dataset(
        test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False, distributed=False
    )

    return (train_dataloader, test_dataloader, label_map)
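# A minimal usage sketch, not from the source: the argument values below are
# illustrative assumptions; only the parameter names come from load() above,
# and `ws` must already be defined in the enclosing scope.
train_dataloader, test_dataloader, label_map = load(
    quick_run=True,                 # small sample for a fast smoke test
    data_path="./ner_data/data",    # hypothetical local paths
    cache_path="./ner_data/cache",
    model_name="bert-base-cased",   # hypothetical model choice
    num_gpus=1,
    random_seed=42,
)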
# Default datastore
def_data_store = ws.get_default_datastore()

# Get the blob storage associated with the workspace
def_blob_store = Datastore(ws, "workspaceblobstore")

# Get file storage associated with the workspace
def_file_store = Datastore(ws, "workspacefilestore")


# In[123]:


def_blob_store.upload_files(
    ["Downloads/005930.KS.csv"],
    target_path="xyz",
    overwrite=True)


# In[ ]:


def_blob_store.upload_files(
    ["./data/20news.pkl"],
    target_path="20newsgroups",
    overwrite=True)


# In[129]:
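# A minimal sketch, not from the original notebook, of how the files uploaded
# above could be consumed downstream as a FileDataset; the dataset name
# "newsgroups_pkl" is an illustrative assumption.
from azureml.core import Dataset

news_ds = Dataset.File.from_files(path=(def_blob_store, "20newsgroups/20news.pkl"))
news_ds = news_ds.register(workspace=ws, name="newsgroups_pkl", create_new_version=True)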