def setUp(self):
    self.workspace = Workspace(
        settings.workspace.id,
        settings.workspace.token,
        settings.workspace.endpoint
    )
    self.original_data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
    self.original_dataframe = pd.DataFrame(self.original_data)
    self.original_name = 'unittestcsvwh' + id_generator()
    self.original_description = 'safe to be deleted - ' + self.original_name
    self.updated_data = [{'a': 101, 'b': 102}, {'a': 105, 'b': 110, 'c': 120}]
    self.updated_dataframe = pd.DataFrame(self.updated_data)
    self.updated_name = 'unittestcsvwhupdate' + id_generator()
    self.updated_description = 'updated'

def test_azureml_example_datasets(self):
    max_size = 10 * 1024 * 1024
    skip = [
        'Restaurant feature data',
        'IMDB Movie Titles',
        'Book Reviews from Amazon',
    ]

    for dataset in self.workspace.example_datasets:
        if not hasattr(dataset, 'to_dataframe'):
            print('skipped (unsupported format): {0}'.format(dataset.name))
            continue

        if dataset.size > max_size:
            print('skipped (max size): {0}'.format(dataset.name))
            continue

        if dataset.name in skip:
            print('skipped: {0}'.format(dataset.name))
            continue

        print('downloading: ' + dataset.name)
        frame = dataset.to_dataframe()

        print('uploading: ' + dataset.name)
        dataset_name = 'unittest' + dataset.name + id_generator()
        description = 'safe to be deleted - ' + dataset_name
        self.workspace.datasets.add_from_dataframe(
            frame, dataset.data_type_id, dataset_name, description)

def test_add_from_dataframe_invalid_name(self): # Arrange invalid_name = "unittestcsvwh:" + id_generator() # Act try: result = self.workspace.datasets.add_from_dataframe( self.original_dataframe, DataTypeIds.GenericCSV, invalid_name, self.original_description ) self.assertTrue(False, "Failed to raise AzureMLHttpError.") except AzureMLHttpError as error: self.assertIn("forbidden characters", str(error)) self.assertEqual(error.status_code, 400)
def test_add_from_dataframe_invalid_name(self):
    # Arrange
    invalid_name = 'unittestcsvwh:' + id_generator()

    # Act
    try:
        result = self.workspace.datasets.add_from_dataframe(
            self.original_dataframe,
            DataTypeIds.GenericCSV,
            invalid_name,
            self.original_description,
        )
        self.assertTrue(False, 'Failed to raise AzureMLHttpError.')
    except AzureMLHttpError as error:
        # Assert
        self.assertIn('forbidden characters', str(error))
        self.assertEqual(error.status_code, 400)

def test_add_from_raw_data_chunked(self):
    # Arrange
    original_name = 'unittestcsvwh' + id_generator()
    # 0x800000 bytes (8 MiB) of random data, large enough to exercise the
    # chunked upload path. bytes() is used instead of joining chr() results,
    # which would fail on Python 3 because chr() returns str, not bytes.
    original_raw_data = bytes(
        random.randint(0, 255) for _ in range(0x800000))

    # Act
    result = self.workspace.datasets.add_from_raw_data(
        original_raw_data,
        DataTypeIds.GenericCSV,
        original_name,
        'test description',
    )

    # Assert
    self.assertIsNotNone(result)
    self.assertIsNotNone(self.workspace.datasets[original_name])
    self.assertEqual(result.name, original_name)
    new_data = self.workspace.datasets[original_name].read_as_binary()
    self.assertEqual(original_raw_data, new_data)

def test_azureml_example_datasets(self): max_size = 10 * 1024 * 1024 skip = ["Restaurant feature data", "IMDB Movie Titles", "Book Reviews from Amazon"] for dataset in self.workspace.example_datasets: if not hasattr(dataset, "to_dataframe"): print("skipped (unsupported format): {0}".format(dataset.name)) continue if dataset.size > max_size: print("skipped (max size): {0}".format(dataset.name)) continue if dataset.name in skip: print("skipped: {0}".format(dataset.name)) continue print("downloading: " + dataset.name) frame = dataset.to_dataframe() print("uploading: " + dataset.name) dataset_name = "unittest" + dataset.name + id_generator() description = "safe to be deleted - " + dataset_name self.workspace.datasets.add_from_dataframe(frame, dataset.data_type_id, dataset_name, description)
def test_download_blob_then_upload_as_dataframe_then_read_dataset(self): def datatypeid_from_header_and_format(header, format): if format == "csv": if header == "wh": return DataTypeIds.GenericCSV else: return DataTypeIds.GenericCSVNoHeader elif format == "tsv": if header == "wh": return DataTypeIds.GenericTSV else: return DataTypeIds.GenericTSVNoHeader elif format == "txt": return DataTypeIds.PlainText else: self.assertTrue(False, "Unexpected format") def split_blob_name(blob_name): # blob naming convention: # name_<header>.<format> # <header>: WH: with header # NH: no header # <format>: CSV: comma separated # TSV: tab separated # TXT: newline separated name, format = blob_name.lower().split(".") if format != "txt": name, header = name.split("_") else: header = "nh" return name, format, header for blob_name in settings.storage.blobs: print(blob_name) name, format, header = split_blob_name(blob_name) # Read the data from blob storage original_data = self.blob.get_blob_to_bytes(settings.storage.container, blob_name) self._write_blob_contents(blob_name, original_data) # Parse the data to a dataframe using Pandas original_dataframe = pd.read_csv( BytesIO(original_data), header=0 if header == "wh" else None, sep="," if format == "csv" else "\t" if format == "tsv" else "\n", encoding="utf-8-sig", ) # Upload the dataframe as a new dataset dataset_name = "unittest" + name + id_generator() description = "safe to be deleted - " + dataset_name data_type_id = datatypeid_from_header_and_format(header, format) self.workspace.datasets.add_from_dataframe(original_dataframe, data_type_id, dataset_name, description) # Get the new dataset dataset = self.workspace.datasets[dataset_name] self.assertIsNotNone(dataset) # Read the dataset as a dataframe result_data = dataset.read_as_binary() self._write_serialized_frame(blob_name, result_data) result_dataframe = dataset.to_dataframe() # Verify that the dataframes are equal assert_frame_equal(original_dataframe, result_dataframe)
def test_download_blob_then_upload_as_dataframe_then_read_dataset(self):
    def datatypeid_from_header_and_format(header, format):
        if format == 'csv':
            if header == 'wh':
                return DataTypeIds.GenericCSV
            else:
                return DataTypeIds.GenericCSVNoHeader
        elif format == 'tsv':
            if header == 'wh':
                return DataTypeIds.GenericTSV
            else:
                return DataTypeIds.GenericTSVNoHeader
        elif format == 'txt':
            return DataTypeIds.PlainText
        else:
            self.assertTrue(False, 'Unexpected format')

    def split_blob_name(blob_name):
        # Blob naming convention:
        #   name_<header>.<format>
        #   <header>: wh: with header
        #             nh: no header
        #   <format>: csv: comma separated
        #             tsv: tab separated
        #             txt: newline separated
        name, format = blob_name.lower().split('.')
        if format != 'txt':
            name, header = name.split('_')
        else:
            header = 'nh'
        return name, format, header

    for blob_name in settings.storage.blobs:
        print(blob_name)
        name, format, header = split_blob_name(blob_name)

        # Read the data from blob storage
        original_data = self.blob.get_blob_to_bytes(
            settings.storage.container, blob_name)
        self._write_blob_contents(blob_name, original_data)

        # Parse the data to a dataframe using Pandas
        original_dataframe = pd.read_csv(
            BytesIO(original_data),
            header=0 if header == 'wh' else None,
            sep=',' if format == 'csv' else '\t' if format == 'tsv' else '\n',
            encoding='utf-8-sig',
        )

        # Upload the dataframe as a new dataset
        dataset_name = 'unittest' + name + id_generator()
        description = 'safe to be deleted - ' + dataset_name
        data_type_id = datatypeid_from_header_and_format(header, format)
        self.workspace.datasets.add_from_dataframe(
            original_dataframe,
            data_type_id,
            dataset_name,
            description,
        )

        # Get the new dataset
        dataset = self.workspace.datasets[dataset_name]
        self.assertIsNotNone(dataset)

        # Read the dataset back as binary and as a dataframe
        result_data = dataset.read_as_binary()
        self._write_serialized_frame(blob_name, result_data)
        result_dataframe = dataset.to_dataframe()

        # Verify that the dataframes are equal
        assert_frame_equal(original_dataframe, result_dataframe)