Example 1
    def __exit__(self, *exc_details):
        """Upload files for datastore.

        :param exc_details:
        :return:
        """
        from azureml.core.datastore import Datastore
        from azureml.data._dataprep_helper import dataprep

        module_logger.debug("Enter __exit__ function of datastore cmgr")
        for key, value in self._config.items():
            df_config, force_read = self._to_data_reference_config(value)
            if self._is_upload(df_config):
                self._validate_config(df_config, key)
                ds = Datastore(workspace=self._workspace,
                               name=df_config.data_store_name)
                if os.path.isdir(df_config.path_on_compute):
                    if self._is_datastore_adlsgen1(ds):
                        module_logger.debug(
                            "AzureDataLake Gen1 used as Datastore for upload dir."
                        )
                        engine_api = dataprep().api.engineapi.api.get_engine_api()
                        typedefinitions = dataprep().api.engineapi.typedefinitions
                        destination = dataprep().api._datastore_helper._to_stream_info_value(
                            ds, df_config.path_on_data_store)
                        engine_api.upload_directory(
                            typedefinitions.UploadDirectoryMessageArguments(
                                base_path=df_config.path_on_compute,
                                folder_path=df_config.path_on_compute,
                                destination=destination,
                                force_read=force_read,
                                overwrite=df_config.overwrite,
                                concurrent_task_count=1))
                    else:
                        ds.upload(src_dir=df_config.path_on_compute,
                                  target_path=df_config.path_on_data_store,
                                  overwrite=df_config.overwrite)
                elif os.path.isfile(df_config.path_on_compute):
                    if self._is_datastore_adlsgen1(ds):
                        module_logger.debug(
                            "AzureDataLake Gen1 used as Datastore for upload file."
                        )
                        engine_api = dataprep().api.engineapi.api.get_engine_api()
                        typedefinitions = dataprep().api.engineapi.typedefinitions
                        destination = dataprep().api._datastore_helper._to_stream_info_value(
                            ds, df_config.path_on_data_store)
                        engine_api.upload_file(
                            typedefinitions.UploadFileMessageArguments(
                                base_path=os.path.dirname(df_config.path_on_compute),
                                local_path=df_config.path_on_compute,
                                destination=destination,
                                force_read=force_read,
                                overwrite=df_config.overwrite))
                    else:
                        ds.upload_files(
                            files=[df_config.path_on_compute],
                            target_path=df_config.path_on_data_store,
                            overwrite=df_config.overwrite)
        module_logger.debug("Exit __exit__ function of datastore cmgr")
Example 2
from azureml.core import Workspace, Datastore, Dataset


def main(args):
    # Load workspace
    print("Loading Workspace")
    workspace = Workspace.from_config()
    print(
        f"Workspace name: {workspace.name}", 
        f"Azure region: {workspace.location}", 
        f"Subscription id: {workspace.subscription_id}", 
        f"Resource group: {workspace.resource_group}",
        sep="\n"
    )

    # Printing all datastores
    print("Printing all datastores")
    for name, datastore in workspace.datastores.items():
        print(name, datastore.datastore_type, sep="\t")
    
    # Load datastore
    print("Loading datastore")
    datastore = Datastore(
        workspace=workspace,
        name=args.datastore_name
    )

    # Upload dataset
    print("Uploading dataset")
    datastore.upload_files(
        files=["./train_dataset/iris.csv"],
        target_path="train_dataset/iris.csv",
        overwrite=True,
        show_progress=True
    )

    # Register dataset
    file_dataset = Dataset.File.from_files(
        # path inferred from the upload above
        path=[(datastore, "train_dataset/iris.csv")]
    )
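
The example stops after building the file dataset. A hedged, standalone sketch of how registration and later retrieval could look; the datastore name "workspaceblobstore" and dataset name "iris_train_ds" are assumptions, not values from the original:

from azureml.core import Workspace, Datastore, Dataset

ws = Workspace.from_config()
datastore = Datastore(ws, name="workspaceblobstore")  # assumed datastore name

# Build a file dataset over the uploaded CSV and register it
file_dataset = Dataset.File.from_files(path=[(datastore, "train_dataset/iris.csv")])
file_dataset = file_dataset.register(
    workspace=ws,
    name="iris_train_ds",  # assumed registration name
    description="Iris training CSV",
    create_new_version=True,
)

# Later, retrieve and download the registered dataset
iris_ds = Dataset.get_by_name(ws, name="iris_train_ds")
local_paths = iris_ds.download(target_path="./data", overwrite=True)
print(local_paths)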
Example 3
                     },
                     delimiter=",",
                     header=None)
df_log.columns = [
    'ModelType', 'FileName', 'ModelName', 'StartTime', 'EndTime', 'Duration',
    'Index', 'BatchSize', 'Status'
]
for col in ['ModelType', 'FileName', 'ModelName', 'StartTime', 'EndTime',
            'Duration', 'Status']:
    df_log[col] = df_log[col].apply(str).str.replace("'", '')
print(df_log.head())
print('Read and cleaned the log file')

# save the log file
output_path = os.path.join('./logs/', 'training_log')
df_log.to_csv(path_or_buf=output_path + '.csv', index=False)
print('Saved the training_log.csv')

# upload the log file
log_dstore = Datastore(ws, args.datastore)
log_dstore.upload_files(['./logs/training_log' + '.csv'],
                        target_path='training_log_' +
                        str(datetime.datetime.now().date()),
                        overwrite=args.overwrite_logs,
                        show_progress=True)
print('Uploaded the training_log.csv')
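
A hedged follow-up sketch, not part of the original script: reading the uploaded log back as a tabular dataset. The datastore name "workspaceblobstore" stands in for args.datastore, and the remote path follows from the date-stamped target_path used above:

import datetime

from azureml.core import Workspace, Datastore, Dataset

ws = Workspace.from_config()
log_dstore = Datastore(ws, "workspaceblobstore")  # stands in for args.datastore

# upload_files() above places the CSV at 'training_log_<date>/training_log.csv'
remote_path = 'training_log_' + str(datetime.datetime.now().date()) + '/training_log.csv'
log_ds = Dataset.Tabular.from_delimited_files(path=[(log_dstore, remote_path)])
print(log_ds.to_pandas_dataframe().head())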
def load(quick_run, data_path, cache_path, model_name, num_gpus, random_seed):

	# Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.
	QUICK_RUN = quick_run

	# Wikigold dataset
	DATA_URL = (
		"https://raw.githubusercontent.com/juand-r/entity-recognition-datasets"
		"/master/data/wikigold/CONLL-format/data/wikigold.conll.txt"
	)

	# fraction of the dataset used for testing
	TEST_DATA_FRACTION = 0.3

	# sub-sampling ratio
	SAMPLE_RATIO = 1

	# the data path used to save the downloaded data file
	DATA_PATH = data_path

	# the cache data path used during fine-tuning
	CACHE_DIR = cache_path

	if not os.path.exists(os.path.dirname(DATA_PATH)):
		os.mkdir(os.path.dirname(DATA_PATH))
	if not os.path.exists(DATA_PATH):
		os.mkdir(DATA_PATH)
	if not os.path.exists(CACHE_DIR):
		os.mkdir(CACHE_DIR)

	# set random seeds
	RANDOM_SEED = random_seed
	torch.manual_seed(RANDOM_SEED)


	MODEL_NAME = model_name
	# MODEL_NAME = "distilbert"
	DO_LOWER_CASE = False
	MAX_SEQ_LENGTH = 200
	TRAILING_PIECE_TAG = "X"
	NUM_GPUS = num_gpus
	BATCH_SIZE = 16


	# update variables for quick run option
	if QUICK_RUN:
		SAMPLE_RATIO = 0.1
		NUM_TRAIN_EPOCHS = 1


	# download data
	file_name = DATA_URL.split("/")[-1]  # a name for the downloaded file
	maybe_download(DATA_URL, file_name, DATA_PATH)
	data_file = os.path.join(DATA_PATH, file_name)

	# parse CoNll file
	sentence_list, labels_list = read_conll_file(data_file, sep=" ", encoding='utf-8')

	# sub-sample (optional)
	random.seed(RANDOM_SEED)
	sample_size = int(SAMPLE_RATIO * len(sentence_list))
	sentence_list, labels_list = list(
		zip(*random.sample(list(zip(sentence_list, labels_list)), k=sample_size))
	)

	# train-test split
	train_sentence_list, test_sentence_list, train_labels_list, test_labels_list = train_test_split(
		sentence_list, labels_list, test_size=TEST_DATA_FRACTION, random_state=RANDOM_SEED
	)

	processor = TokenClassificationProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE, cache_dir=CACHE_DIR)


	label_map = TokenClassificationProcessor.create_label_map(
		label_lists=labels_list, trailing_piece_tag=TRAILING_PIECE_TAG
	)

	train_dataset = processor.preprocess(
		text=train_sentence_list,
		max_len=MAX_SEQ_LENGTH,
		labels=train_labels_list,
		label_map=label_map,
		trailing_piece_tag=TRAILING_PIECE_TAG,
	)

	# train_data_loader = DataLoader(train_dataset)
	test_dataset = processor.preprocess(
		text=test_sentence_list,
		max_len=MAX_SEQ_LENGTH,
		labels=test_labels_list,
		label_map=label_map,
		trailing_piece_tag=TRAILING_PIECE_TAG,
	)

	torch.save(train_dataset, os.path.join(DATA_PATH, 'train.pt'))
	torch.save(test_dataset, os.path.join(DATA_PATH, 'test.pt'))
	torch.save(label_map, os.path.join(DATA_PATH, 'label_map.pt'))

	# Default datastore
	def_data_store = ws.get_default_datastore()

	# Get the blob storage associated with the workspace
	def_blob_store = Datastore(ws, "workspaceblobstore")

	# Get file storage associated with the workspace
	def_file_store = Datastore(ws, "workspacefilestore")

	try:
		def_blob_store.upload_files(
			[os.path.join(DATA_PATH, 'train.pt')],
			target_path="nerdata", overwrite=True, show_progress=True)
	except Exception as e:
		print(f"Failed to upload -> {e}")

	try:
		def_blob_store.upload_files(
			[os.path.join(DATA_PATH, 'test.pt')],
			target_path="nerdata", overwrite=True, show_progress=True)
	except Exception as e:
		print(f"Failed to upload -> {e}")

	try:
		def_blob_store.upload_files(
			[os.path.join(DATA_PATH, 'label_map.pt')],
			target_path="nerdata", overwrite=True, show_progress=True)
	except Exception as e:
		print(f"Failed to upload -> {e}")

	train_datastore_paths = [(def_blob_store, 'nerdata/train.pt')]
	test_datastore_paths = [(def_blob_store, 'nerdata/test.pt')]
	label_map_datastore_paths = [(def_blob_store, 'nerdata/label_map.pt')]

	# def_blob_store.upload(src_dir=DATA_PATH, target_path="nerdata", overwrite=True, show_progress=True)

	train_ds = Dataset.File.from_files(path=train_datastore_paths)
	test_ds = Dataset.File.from_files(path=test_datastore_paths)
	label_map_ds = Dataset.File.from_files(path=label_map_datastore_paths)

	train_ds = train_ds.register(
		workspace=ws,
		name='ner_bert_train_ds',
		description='Named Entity Recognition with BERT (Training set)',
		create_new_version=False)

	test_ds = test_ds.register(
		workspace=ws,
		name='ner_bert_test_ds',
		description='Named Entity Recognition with BERT (Testing set)',
		create_new_version=False)

	label_map_ds = label_map_ds.register(
		workspace=ws,
		name='ner_bert_label_map_ds',
		description='Named Entity Recognition with BERT (Label map)',
		create_new_version=False)

	train_dataloader = dataloader_from_dataset(
		train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True, distributed=False
	)

	test_dataloader = dataloader_from_dataset(
		test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False, distributed=False
	)

	return (train_dataloader, test_dataloader, label_map)
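
A hedged invocation sketch for load(); every argument value is a placeholder, and note that the function also reads a module-level Workspace named ws, which has to exist before the call:

from azureml.core import Workspace

ws = Workspace.from_config()  # load() references this module-level name

train_dataloader, test_dataloader, label_map = load(
    quick_run=True,            # placeholder values throughout
    data_path="data/",
    cache_path="cache/",
    model_name="bert-base-cased",
    num_gpus=1,
    random_seed=42,
)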
# Default datastore 
def_data_store = ws.get_default_datastore()

# Get the blob storage associated with the workspace
def_blob_store = Datastore(ws, "workspaceblobstore")

# Get file storage associated with the workspace
def_file_store = Datastore(ws, "workspacefilestore")


# In[123]:


def_blob_store.upload_files(
    ["Downloads/005930.KS.csv"],
    target_path="xyz",
    overwrite=True)


# In[ ]:


def_blob_store.upload_files(
    ["./data/20news.pkl"],
    target_path="20newsgroups",
    overwrite=True)


# In[129]:
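
Once uploaded, the blobs above can be referenced by their datastore paths. A hedged sketch for the stock CSV from the earlier cell, reusing def_blob_store; only its location under the "xyz" target path is taken from the example:

from azureml.core import Dataset

# upload_files(target_path="xyz") above places the file at 'xyz/005930.KS.csv'
stock_ds = Dataset.Tabular.from_delimited_files(path=[(def_blob_store, "xyz/005930.KS.csv")])
print(stock_ds.take(5).to_pandas_dataframe())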