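# ---------------------------------------------------------------------------
# NOTE: this listing is a fragment of a larger test module. The commented-out
# context below is a minimal sketch of what the methods assume, not the
# original file; the module path for AzureMLHttpError and the test helpers
# ('settings', 'id_generator') are assumptions and may differ in practice.
# ---------------------------------------------------------------------------
# import random
# import unittest
# from io import BytesIO
#
# import pandas as pd
# from pandas.testing import assert_frame_equal   # pandas.util.testing in older releases
#
# from azureml import Workspace, DataTypeIds       # Azure ML Studio client library
# from azureml.errors import AzureMLHttpError      # assumed module path
#
# from .helpers import settings, id_generator      # hypothetical test helpers
#
# class DatasetsTests(unittest.TestCase):          # assumed enclosing test class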
    def setUp(self):
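        """Create the Workspace client and the sample dataframes shared by the tests."""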
        self.workspace = Workspace(
            settings.workspace.id,
            settings.workspace.token,
            settings.workspace.endpoint
        )

        self.original_data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
        self.original_dataframe = pd.DataFrame(self.original_data)
        self.original_name = 'unittestcsvwh' + id_generator()
        self.original_description = 'safe to be deleted - ' + self.original_name

        self.updated_data = [{'a': 101, 'b': 102}, {'a': 105, 'b': 110, 'c': 120}]
        self.updated_dataframe = pd.DataFrame(self.updated_data)
        self.updated_name = 'unittestcsvwhupdate' + id_generator()
        self.updated_description = 'updated'
    def test_azureml_example_datasets(self):
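        """Download each small example dataset as a dataframe, then re-upload a copy of it."""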
        max_size = 10 * 1024 * 1024
        skip = [
            'Restaurant feature data',
            'IMDB Movie Titles',
            'Book Reviews from Amazon',
        ]

        for dataset in self.workspace.example_datasets:
            if not hasattr(dataset, 'to_dataframe'):
                print('skipped (unsupported format): {0}'.format(dataset.name))
                continue

            if dataset.size > max_size:
                print('skipped (max size): {0}'.format(dataset.name))
                continue

            if dataset.name in skip:
                print('skipped: {0}'.format(dataset.name))
                continue

            print('downloading: ' + dataset.name)
            frame = dataset.to_dataframe()

            print('uploading: ' + dataset.name)
            dataset_name = 'unittest' + dataset.name + id_generator()
            description = 'safe to be deleted - ' + dataset_name
            self.workspace.datasets.add_from_dataframe(
                frame,
                dataset.data_type_id,
                dataset_name,
                description,
            )
    def test_add_from_dataframe_invalid_name(self):
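        """Names with forbidden characters (e.g. ':') should be rejected with an HTTP 400."""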
        # Arrange
        invalid_name = 'unittestcsvwh:' + id_generator()

        # Act
        try:
            self.workspace.datasets.add_from_dataframe(
                self.original_dataframe,
                DataTypeIds.GenericCSV,
                invalid_name,
                self.original_description,
            )
            self.fail('Expected AzureMLHttpError for a name containing forbidden characters.')
        except AzureMLHttpError as error:
            self.assertIn('forbidden characters', str(error))
            self.assertEqual(error.status_code, 400)
    def test_add_from_raw_data_chunked(self):
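        """Upload ~8 MiB of raw bytes (forcing the chunked upload path) and read it back unchanged."""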
        original_name = 'unittestcsvwh' + id_generator()

        # Arrange
        # 8 MiB (0x800000 bytes) of random data, large enough to exercise the chunked upload path
        original_raw_data = bytes(random.randint(0, 255) for _ in range(0x800000))

        # Act
        result = self.workspace.datasets.add_from_raw_data(
            original_raw_data,
            DataTypeIds.GenericCSV,
            original_name,
            'test description',
        )

        # Assert
        self.assertIsNotNone(result)
        self.assertIsNotNone(self.workspace.datasets[original_name])
        self.assertEqual(result.name, original_name)

        new_data = self.workspace.datasets[original_name].read_as_binary()
        self.assertEqual(original_raw_data, new_data)
    def test_download_blob_then_upload_as_dataframe_then_read_dataset(self):
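        """Round-trip each test blob: download it, parse it with pandas, upload it as a dataset, and verify the dataframes match."""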
        def datatypeid_from_header_and_format(header, format):
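            """Map the blob's header/format naming convention to the matching DataTypeIds value."""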
            if format == 'csv':
                if header == 'wh':
                    return DataTypeIds.GenericCSV
                else:
                    return DataTypeIds.GenericCSVNoHeader
            elif format == 'tsv':
                if header == 'wh':
                    return DataTypeIds.GenericTSV
                else:
                    return DataTypeIds.GenericTSVNoHeader
            elif format == 'txt':
                return DataTypeIds.PlainText
            else:
                self.fail('Unexpected format: {0}'.format(format))

        def split_blob_name(blob_name):
            # blob naming convention:
            # name_<header>.<format>
            # <header>: WH: with header
            #           NH: no header
            # <format>: CSV: comma separated
            #           TSV: tab separated
            #           TXT: newline separated
            name, format = blob_name.lower().split('.')
            if format != 'txt':
                name, header = name.split('_')
            else:
                header = 'nh'

            return name, format, header

        for blob_name in settings.storage.blobs:
            print(blob_name)

            name, format, header = split_blob_name(blob_name)

            # Read the data from blob storage
            original_data = self.blob.get_blob_to_bytes(settings.storage.container, blob_name)
            self._write_blob_contents(blob_name, original_data)

            # Parse the data to a dataframe using Pandas
            original_dataframe = pd.read_csv(
                BytesIO(original_data),
                header=0 if header == 'wh' else None,
                sep=',' if format == 'csv' else '\t' if format == 'tsv' else '\n',
                encoding='utf-8-sig'
            )

            # Upload the dataframe as a new dataset
            dataset_name = 'unittest' + name + id_generator()
            description = 'safe to be deleted - ' + dataset_name
            data_type_id = datatypeid_from_header_and_format(header, format)
            self.workspace.datasets.add_from_dataframe(
                original_dataframe,
                data_type_id,
                dataset_name,
                description,
            )

            # Get the new dataset
            dataset = self.workspace.datasets[dataset_name]
            self.assertIsNotNone(dataset)

            # Read the dataset as a dataframe
            result_data = dataset.read_as_binary()
            self._write_serialized_frame(blob_name, result_data)
            result_dataframe = dataset.to_dataframe()

            # Verify that the dataframes are equal
            assert_frame_equal(original_dataframe, result_dataframe)