Example #1
    def test_quick_query_iter_records_with_nonfatal_error_ignore(
            self, datalake_storage_account_name, datalake_storage_account_key):
        self._setUp(datalake_storage_account_name,
                    datalake_storage_account_key)
        # Arrange
        # upload the csv file
        file_name = self._get_file_reference()
        file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
        file_client.upload_data(CSV_DATA, overwrite=True)

        input_format = DelimitedTextDialect(delimiter=',',
                                            quotechar='"',
                                            lineterminator='\n',
                                            escapechar='',
                                            has_header=True)
        output_format = DelimitedTextDialect(
            delimiter=';',
            quotechar="'",
            lineterminator='$',
            escapechar='\\',
        )
        resp = file_client.query_file("SELECT RepoPath from BlobStorage",
                                      file_format=input_format,
                                      output_format=output_format)
        data = list(resp.records())
        self.assertEqual(len(resp), len(CSV_DATA))
        self.assertEqual(len(data), 32)
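
CSV_DATA itself is not reproduced on this page. The assertions in Examples #4 and #8 imply it is a CRLF-terminated byte string with a 44-byte header row and 32 data rows; a hypothetical miniature of the fixture, showing only the two rows those assertions actually reveal:

# Hypothetical stand-in for the CSV_DATA fixture; only the header and the
# first data row are confirmed by the assertions in Examples #4 and #8.
CSV_DATA = (
    b'Service,Package,Version,RepoPath,MissingDocs\r\n'
    b'App Configuration,azure-data-appconfiguration,1,appconfiguration,FALSE\r\n'
)
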
Example #2
    def test_quick_query_readall_with_serialization_setting(
            self, datalake_storage_account_name, datalake_storage_account_key):
        self._setUp(datalake_storage_account_name,
                    datalake_storage_account_key)
        # Arrange
        # upload the csv file
        file_name = self._get_file_reference()
        file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
        file_client.upload_data(CSV_DATA, overwrite=True)

        errors = []

        def on_error(error):
            errors.append(error)

        input_format = DelimitedTextDialect(delimiter=',',
                                            quotechar='"',
                                            lineterminator='\n',
                                            escapechar='',
                                            has_header=False)
        output_format = DelimitedTextDialect(delimiter=';',
                                             quotechar="'",
                                             lineterminator='.',
                                             escapechar='\\')
        resp = file_client.query_file("SELECT * from BlobStorage",
                                      on_error=on_error,
                                      file_format=input_format,
                                      output_format=output_format)
        query_result = resp.readall()

        self.assertEqual(len(errors), 0)
        self.assertEqual(len(resp), len(CSV_DATA))
        self.assertEqual(query_result, CONVERTED_CSV_DATA)
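
CONVERTED_CSV_DATA is likewise a fixture assumed by the test: the input rows re-serialized under the output dialect above. Illustratively, for a hypothetical single record:

# Hypothetical illustration of the output dialect in Example #2
# (delimiter=';', quotechar="'", lineterminator='.'): an input record like
#     b'App Configuration,azure-data-appconfiguration,1,appconfiguration,FALSE\n'
# would be re-emitted as
#     b'App Configuration;azure-data-appconfiguration;1;appconfiguration;FALSE.'
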
Example #3
    def test_quick_query_iter_records_with_serialization_setting(
            self, datalake_storage_account_name, datalake_storage_account_key):
        self._setUp(datalake_storage_account_name,
                    datalake_storage_account_key)
        # Arrange
        # upload the csv file
        file_name = self._get_file_reference()
        file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
        file_client.upload_data(CSV_DATA, overwrite=True)

        input_format = DelimitedTextDialect(delimiter=',',
                                            quotechar='"',
                                            lineterminator='\n',
                                            escapechar='',
                                            has_header=False)
        output_format = DelimitedTextDialect(delimiter=';',
                                             quotechar="'",
                                             lineterminator='%',
                                             escapechar='\\')

        reader = file_client.query_file("SELECT * from BlobStorage",
                                        file_format=input_format,
                                        output_format=output_format)
        data = []
        for record in reader.records():
            if record:
                data.append(record)

        self.assertEqual(len(reader), len(CSV_DATA))
        self.assertEqual(len(reader),
                         reader._blob_query_reader._bytes_processed)
        self.assertEqual(len(data), 33)
Example #4
    def test_quick_query_iter_output_records_excluding_headers(
            self, datalake_storage_account_name, datalake_storage_account_key):
        self._setUp(datalake_storage_account_name,
                    datalake_storage_account_key)
        # Arrange
        # upload the csv file
        file_name = self._get_file_reference()
        file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
        file_client.upload_data(CSV_DATA, overwrite=True)

        input_format = DelimitedTextDialect(has_header=True)
        output_format = DelimitedTextDialect(has_header=False)
        reader = file_client.query_file("SELECT * from BlobStorage",
                                        file_format=input_format,
                                        output_format=output_format)
        read_records = reader.records()

        # Assert first line does not include header
        data = next(read_records)
        self.assertEqual(
            data,
            b'App Configuration,azure-data-appconfiguration,1,appconfiguration,FALSE'
        )

        for record in read_records:
            data += record

        self.assertEqual(len(reader), len(CSV_DATA))
        self.assertEqual(len(reader),
                         reader._blob_query_reader._bytes_processed)
        self.assertEqual(data, CSV_DATA.replace(b'\r\n', b'')[44:])
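
The magic number 44 in the final assertion is the byte length of the header row asserted in Example #8, so the slice drops exactly the header from the flattened CSV. A quick sanity check:

# The header row is 44 bytes long, so CSV_DATA.replace(b'\r\n', b'')[44:]
# is the flattened file with the header removed.
assert len(b'Service,Package,Version,RepoPath,MissingDocs') == 44
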
Example #5
    def test_quick_query_readall_with_nonfatal_error_ignore(self):
        # Arrange
        # upload the csv file
        file_name = self._get_file_reference()
        file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
        file_client.upload_data(CSV_DATA, overwrite=True)

        input_format = DelimitedTextDialect(
            delimiter=',',
            quotechar='"',
            lineterminator='\n',
            escapechar='',
            has_header=True
        )
        output_format = DelimitedTextDialect(
            delimiter=';',
            quotechar="'",
            lineterminator='.',
            escapechar='\\',
        )
        resp = file_client.query_file(
            "SELECT RepoPath from BlobStorage",
            file_format=input_format,
            output_format=output_format)
        query_result = resp.readall()
        self.assertEqual(len(resp), len(CSV_DATA))
        self.assertTrue(len(query_result) > 0)
Example #6
    def test_quick_query_iter_records_with_nonfatal_error_handler(self):
        # Arrange
        # upload the csv file
        file_name = self._get_file_reference()
        file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
        file_client.upload_data(CSV_DATA, overwrite=True)

        errors = []
        def on_error(error):
            errors.append(error)

        input_format = DelimitedTextDialect(
            delimiter=',',
            quotechar='"',
            lineterminator='\n',
            escapechar='',
            has_header=True
        )
        output_format = DelimitedTextDialect(
            delimiter=';',
            quotechar="'",
            lineterminator='%',
            escapechar='\\',
        )
        resp = file_client.query_file(
            "SELECT RepoPath from BlobStorage",
            file_format=input_format,
            output_format=output_format,
            on_error=on_error)
        data = list(resp.records())

        # the non-fatal error occurs because one line of the CSV has only a single column
        self.assertEqual(len(errors), 1)
        self.assertEqual(len(resp), len(CSV_DATA))
        self.assertEqual(len(data), 32)
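
The error object handed to on_error carries more than these tests use: Example #10 reads its description attribute, and in the azure-storage-file-datalake error model it also exposes error, is_fatal and position. A sketch of a more informative handler, assuming that model:

errors = []

# Sketch of a handler that logs error details instead of only collecting them.
# `description` is used in Example #10; `error`, `is_fatal` and `position`
# are assumed from the DataLakeFileQueryError model.
def on_error(error):
    print("query error '{}' at byte {} (fatal: {}): {}".format(
        error.error, error.position, error.is_fatal, error.description))
    errors.append(error)
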
Example #7
    def test_quick_query_iter_records_with_fatal_error_ignore(
            self, datalake_storage_account_name, datalake_storage_account_key):
        self._setUp(datalake_storage_account_name,
                    datalake_storage_account_key)
        # Arrange
        data1 = b'{name: owner}'
        data2 = b'{name2: owner2}'
        data3 = b'{version:0,begin:1601-01-01T00:00:00.000Z,intervalSecs:3600,status:Finalized,config:' \
                b'{version:0,configVersionEtag:0x8d75ef460eb1a12,numShards:1,recordsFormat:avro,formatSchemaVersion:3,' \
                b'shardDistFnVersion:1},chunkFilePaths:[$blobchangefeed/log/00/1601/01/01/0000/],storageDiagnostics:' \
                b'{version:0,lastModifiedTime:2019-11-01T17:53:18.861Z,' \
                b'data:{aid:d305317d-a006-0042-00dd-902bbb06fc56}}}'
        data = data1 + b'\n' + data2 + b'\n' + data1

        # upload the json file
        file_name = self._get_file_reference()
        file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
        file_client.upload_data(data, overwrite=True)

        input_format = DelimitedJsonDialect()
        output_format = DelimitedTextDialect(delimiter=';',
                                             quotechar="'",
                                             lineterminator='.',
                                             escapechar='\\')
        resp = file_client.query_file("SELECT * from BlobStorage",
                                      file_format=input_format,
                                      output_format=output_format)

        for record in resp.records():
            print(record)
Example #8
    def test_quick_query_iter_output_records_including_headers(
            self, datalake_storage_account_name, datalake_storage_account_key):
        self._setUp(datalake_storage_account_name,
                    datalake_storage_account_key)
        # Arrange
        # upload the csv file
        file_name = self._get_file_reference()
        file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
        file_client.upload_data(CSV_DATA, overwrite=True)

        input_format = DelimitedTextDialect(has_header=True)
        reader = file_client.query_file("SELECT * from BlobStorage",
                                        file_format=input_format)
        read_records = reader.records()

        # Assert the first record is the header row
        data = next(read_records)
        self.assertEqual(data, b'Service,Package,Version,RepoPath,MissingDocs')

        for record in read_records:
            data += record

        self.assertEqual(len(reader), len(CSV_DATA))
        self.assertEqual(len(reader),
                         reader._blob_query_reader._bytes_processed)
        self.assertEqual(data, CSV_DATA.replace(b'\r\n', b''))
Example #9
    def test_quick_query_datalake_expression(self,
                                             datalake_storage_account_name,
                                             datalake_storage_account_key):
        self._setUp(datalake_storage_account_name,
                    datalake_storage_account_key)
        # Arrange
        # upload the csv file
        file_name = self._get_file_reference()
        file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
        file_client.upload_data(DATALAKE_CSV_DATA, overwrite=True)

        errors = []

        def on_error(error):
            errors.append(error)

        input_format = DelimitedTextDialect(has_header=True)
        reader = file_client.query_file(
            "SELECT DataLakeStorage from DataLakeStorage",
            on_error=on_error,
            file_format=input_format)
        reader.readall()

        self.assertEqual(len(errors), 0)
        self.assertEqual(len(reader), len(DATALAKE_CSV_DATA))
        self.assertEqual(len(reader),
                         reader._blob_query_reader._bytes_processed)
Example #10
    def test_quick_query_readall_with_fatal_error_handler_raise(self):
        # Arrange
        data1 = b'{name: owner}'
        data2 = b'{name2: owner2}'
        data3 = b'{version:0,begin:1601-01-01T00:00:00.000Z,intervalSecs:3600,status:Finalized,config:' \
                b'{version:0,configVersionEtag:0x8d75ef460eb1a12,numShards:1,recordsFormat:avro,formatSchemaVersion:3,' \
                b'shardDistFnVersion:1},chunkFilePaths:[$blobchangefeed/log/00/1601/01/01/0000/],storageDiagnostics:' \
                b'{version:0,lastModifiedTime:2019-11-01T17:53:18.861Z,' \
                b'data:{aid:d305317d-a006-0042-00dd-902bbb06fc56}}}'
        data = data1 + b'\n' + data2 + b'\n' + data1

        # upload the json file
        file_name = self._get_file_reference()
        file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
        file_client.upload_data(data, overwrite=True)

        errors = []

        def on_error(error):
            raise Exception(error.description)

        input_format = DelimitedJsonDialect()
        output_format = DelimitedTextDialect(
            delimiter=';',
            quotechar="'",
            lineterminator='.',
            escapechar='\\'
        )
        resp = file_client.query_file(
            "SELECT * from BlobStorage",
            on_error=on_error,
            file_format=input_format,
            output_format=output_format)
        with pytest.raises(Exception):
            resp.readall()
Example #11
import os
import sys

from azure.storage.filedatalake import (
    DataLakeServiceClient,
    DelimitedJsonDialect,
    DelimitedTextDialect,
)


def main():
    try:
        CONNECTION_STRING = os.environ['AZURE_STORAGE_CONNECTION_STRING']

    except KeyError:
        print("AZURE_STORAGE_CONNECTION_STRING must be set.")
        sys.exit(1)

    datalake_service_client = DataLakeServiceClient.from_connection_string(
        CONNECTION_STRING)
    filesystem_name = "quickqueryfilesystem"
    filesystem_client = datalake_service_client.get_file_system_client(
        filesystem_name)
    try:
        filesystem_client.create_file_system()
    except Exception:
        # the file system may already exist; ignore the error
        pass
    # [START query]
    errors = []

    def on_error(error):
        errors.append(error)

    # upload the csv file
    file_client = datalake_service_client.get_file_client(
        filesystem_name, "csvfile")
    file_client.upload_data(CSV_DATA, overwrite=True)

    # select the second column of the csv file
    query_expression = "SELECT _2 from DataLakeStorage"
    input_format = DelimitedTextDialect(delimiter=',',
                                        quotechar='"',
                                        lineterminator='\n',
                                        escapechar="",
                                        has_header=False)
    output_format = DelimitedJsonDialect(delimiter='\n')
    reader = file_client.query_file(query_expression,
                                    on_error=on_error,
                                    file_format=input_format,
                                    output_format=output_format)
    content = reader.readall()
    # [END query]
    print(content)

    filesystem_client.delete_file_system()
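
The CSV_DATA referenced above is defined elsewhere in the sample file. Run as a script, the sample would end with the usual entry-point guard:

if __name__ == '__main__':
    main()
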
Example #12
    def test_quick_query_readall_with_fatal_error_ignore(
            self, datalake_storage_account_name, datalake_storage_account_key):
        self._setUp(datalake_storage_account_name,
                    datalake_storage_account_key)
        # Arrange
        data1 = b'{name: owner}'
        data2 = b'{name2: owner2}'
        data = data1 + b'\n' + data2 + b'\n' + data1

        # upload the json file
        file_name = self._get_file_reference()
        file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
        file_client.upload_data(data, overwrite=True)

        input_format = DelimitedJsonDialect()
        output_format = DelimitedTextDialect(delimiter=';',
                                             quotechar="'",
                                             lineterminator='.',
                                             escapechar='\\')
        resp = file_client.query_file("SELECT * from BlobStorage",
                                      file_format=input_format,
                                      output_format=output_format)
        query_result = resp.readall()
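
The fatal error in Examples #7, #10 and #12 comes from the payload itself: lines like b'{name: owner}' look like JSON but have unquoted keys, so DelimitedJsonDialect cannot parse them. With no on_error handler, readall() here simply returns whatever output could be produced. For contrast, a hypothetical well-formed JSON-lines payload that the same dialect would accept:

# Hypothetical well-formed JSON-lines input for DelimitedJsonDialect;
# unlike b'{name: owner}', keys and string values are quoted.
valid_data = b'{"name": "owner"}\n{"name2": "owner2"}'
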