def test_quick_query_iter_records_with_nonfatal_error_ignore(
        self, datalake_storage_account_name, datalake_storage_account_key):
    """Nonfatal query errors are silently ignored when no on_error handler is given."""
    self._setUp(datalake_storage_account_name, datalake_storage_account_key)

    # Arrange: upload the csv file
    file_name = self._get_file_reference()
    file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
    file_client.upload_data(CSV_DATA, overwrite=True)

    input_format = DelimitedTextDialect(
        delimiter=',',
        quotechar='"',
        lineterminator='\n',
        escapechar='',
        has_header=True)
    output_format = DelimitedTextDialect(
        delimiter=';',
        quotechar="'",
        lineterminator='$',
        escapechar='\\',
    )

    # Act: run the query and drain the record iterator
    resp = file_client.query_file(
        "SELECT RepoPath from BlobStorage",
        file_format=input_format,
        output_format=output_format)
    records = [record for record in resp.records()]

    # Assert: whole file was processed and the expected row count came back
    self.assertEqual(len(resp), len(CSV_DATA))
    self.assertEqual(len(records), 32)
def test_quick_query_readall_with_serialization_setting(
        self, datalake_storage_account_name, datalake_storage_account_key):
    """readall() with custom input/output dialects yields the converted data with no errors."""
    self._setUp(datalake_storage_account_name, datalake_storage_account_key)

    # Arrange: upload the csv file
    file_name = self._get_file_reference()
    file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
    file_client.upload_data(CSV_DATA, overwrite=True)

    # Collect any query errors reported through the handler
    reported_errors = []

    def on_error(error):
        reported_errors.append(error)

    input_format = DelimitedTextDialect(
        delimiter=',',
        quotechar='"',
        lineterminator='\n',
        escapechar='',
        has_header=False)
    output_format = DelimitedTextDialect(
        delimiter=';',
        quotechar="'",
        lineterminator='.',
        escapechar='\\')

    # Act
    resp = file_client.query_file(
        "SELECT * from BlobStorage",
        on_error=on_error,
        file_format=input_format,
        output_format=output_format)
    query_result = resp.readall()

    # Assert: no errors, full file processed, output matches expected conversion
    self.assertEqual(len(reported_errors), 0)
    self.assertEqual(len(resp), len(CSV_DATA))
    self.assertEqual(query_result, CONVERTED_CSV_DATA)
def test_quick_query_iter_records_with_serialization_setting(
        self, datalake_storage_account_name, datalake_storage_account_key):
    """Iterating records with custom dialects yields the expected number of non-empty rows."""
    self._setUp(datalake_storage_account_name, datalake_storage_account_key)

    # Arrange: upload the csv file
    file_name = self._get_file_reference()
    file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
    file_client.upload_data(CSV_DATA, overwrite=True)

    input_format = DelimitedTextDialect(
        delimiter=',',
        quotechar='"',
        lineterminator='\n',
        escapechar='',
        has_header=False)
    output_format = DelimitedTextDialect(
        delimiter=';',
        quotechar="'",
        lineterminator='%',
        escapechar='\\')

    # Act: keep only non-empty records
    reader = file_client.query_file(
        "SELECT * from BlobStorage",
        file_format=input_format,
        output_format=output_format)
    records = [record for record in reader.records() if record]

    # Assert: all bytes of the source file were processed
    self.assertEqual(len(reader), len(CSV_DATA))
    self.assertEqual(len(reader), reader._blob_query_reader._bytes_processed)
    self.assertEqual(len(records), 33)
def test_quick_query_iter_output_records_excluding_headers(
        self, datalake_storage_account_name, datalake_storage_account_key):
    """With has_header output disabled, the header row is stripped from the records."""
    self._setUp(datalake_storage_account_name, datalake_storage_account_key)

    # Arrange: upload the csv file
    file_name = self._get_file_reference()
    file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
    file_client.upload_data(CSV_DATA, overwrite=True)

    input_format = DelimitedTextDialect(has_header=True)
    output_format = DelimitedTextDialect(has_header=False)
    reader = file_client.query_file(
        "SELECT * from BlobStorage",
        file_format=input_format,
        output_format=output_format)
    record_iter = reader.records()

    # Assert the first record is data, not the header line
    first_record = next(record_iter)
    self.assertEqual(
        first_record,
        b'App Configuration,azure-data-appconfiguration,1,appconfiguration,FALSE'
    )

    # Concatenate the remaining records onto the first one
    data = first_record + b''.join(record_iter)

    self.assertEqual(len(reader), len(CSV_DATA))
    self.assertEqual(len(reader), reader._blob_query_reader._bytes_processed)
    # [44:] skips the header line of the source csv
    self.assertEqual(data, CSV_DATA.replace(b'\r\n', b'')[44:])
def test_quick_query_readall_with_nonfatal_error_ignore(self):
    """readall() succeeds and returns data even when nonfatal errors occur without a handler."""
    # Arrange: upload the csv file
    file_name = self._get_file_reference()
    file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
    file_client.upload_data(CSV_DATA, overwrite=True)

    input_format = DelimitedTextDialect(
        delimiter=',',
        quotechar='"',
        lineterminator='\n',
        escapechar='',
        has_header=True)
    output_format = DelimitedTextDialect(
        delimiter=';',
        quotechar="'",
        lineterminator='.',
        escapechar='\\',
    )

    # Act
    resp = file_client.query_file(
        "SELECT RepoPath from BlobStorage",
        file_format=input_format,
        output_format=output_format)
    query_result = resp.readall()

    # Assert: the entire file was processed and some output was produced
    self.assertEqual(len(resp), len(CSV_DATA))
    self.assertTrue(len(query_result) > 0)
def test_quick_query_iter_records_with_nonfatal_error_handler(self):
    """A registered on_error handler receives exactly one nonfatal error for the bad row."""
    # Arrange: upload the csv file
    file_name = self._get_file_reference()
    file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
    file_client.upload_data(CSV_DATA, overwrite=True)

    reported_errors = []

    def on_error(error):
        reported_errors.append(error)

    input_format = DelimitedTextDialect(
        delimiter=',',
        quotechar='"',
        lineterminator='\n',
        escapechar='',
        has_header=True)
    output_format = DelimitedTextDialect(
        delimiter=';',
        quotechar="'",
        lineterminator='%',
        escapechar='\\',
    )

    # Act
    resp = file_client.query_file(
        "SELECT RepoPath from BlobStorage",
        file_format=input_format,
        output_format=output_format,
        on_error=on_error)
    records = [record for record in resp.records()]

    # Assert: exactly one error — the line that only has one column
    self.assertEqual(len(reported_errors), 1)
    self.assertEqual(len(resp), len(CSV_DATA))
    self.assertEqual(len(records), 32)
def test_quick_query_iter_records_with_fatal_error_ignore(
        self, datalake_storage_account_name, datalake_storage_account_key):
    """Iterating records over malformed JSON input does not raise when no
    on_error handler is registered (fatal errors are ignored).
    """
    self._setUp(datalake_storage_account_name, datalake_storage_account_key)

    # Arrange: deliberately malformed JSON lines (unquoted keys/values)
    # NOTE: the original test also built a large unused `data3` changefeed
    # literal; it was dead code and has been removed.
    data1 = b'{name: owner}'
    data2 = b'{name2: owner2}'
    data = data1 + b'\n' + data2 + b'\n' + data1

    # upload the json file
    file_name = self._get_file_reference()
    file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
    file_client.upload_data(data, overwrite=True)

    input_format = DelimitedJsonDialect()
    output_format = DelimitedTextDialect(
        delimiter=';',
        quotechar="'",
        lineterminator='.',
        escapechar='\\')
    resp = file_client.query_file(
        "SELECT * from BlobStorage",
        file_format=input_format,
        output_format=output_format)

    # Act/Assert: draining the iterator must not raise despite the fatal error
    for record in resp.records():
        print(record)
def test_quick_query_iter_output_records_including_headers(
        self, datalake_storage_account_name, datalake_storage_account_key):
    """Without an output dialect, the header row is emitted as the first record."""
    self._setUp(datalake_storage_account_name, datalake_storage_account_key)

    # Arrange: upload the csv file
    file_name = self._get_file_reference()
    file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
    file_client.upload_data(CSV_DATA, overwrite=True)

    input_format = DelimitedTextDialect(has_header=True)
    reader = file_client.query_file(
        "SELECT * from BlobStorage",
        file_format=input_format)
    record_iter = reader.records()

    # Assert the first record is the csv header line
    first_record = next(record_iter)
    self.assertEqual(first_record, b'Service,Package,Version,RepoPath,MissingDocs')

    # Concatenate the remaining records onto the header
    data = first_record + b''.join(record_iter)

    self.assertEqual(len(reader), len(CSV_DATA))
    self.assertEqual(len(reader), reader._blob_query_reader._bytes_processed)
    self.assertEqual(data, CSV_DATA.replace(b'\r\n', b''))
def test_quick_query_datalake_expression(
        self, datalake_storage_account_name, datalake_storage_account_key):
    """Querying with the DataLakeStorage table alias works without errors."""
    self._setUp(datalake_storage_account_name, datalake_storage_account_key)

    # Arrange: upload the csv file
    file_name = self._get_file_reference()
    file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
    file_client.upload_data(DATALAKE_CSV_DATA, overwrite=True)

    reported_errors = []

    def on_error(error):
        reported_errors.append(error)

    input_format = DelimitedTextDialect(has_header=True)

    # Act: select a column by header name from the DataLakeStorage alias
    reader = file_client.query_file(
        "SELECT DataLakeStorage from DataLakeStorage",
        on_error=on_error,
        file_format=input_format)
    reader.readall()

    # Assert: no errors and the whole file was processed
    self.assertEqual(len(reported_errors), 0)
    self.assertEqual(len(reader), len(DATALAKE_CSV_DATA))
    self.assertEqual(len(reader), reader._blob_query_reader._bytes_processed)
def test_quick_query_readall_with_fatal_error_handler_raise(self):
    """An on_error handler that raises propagates the exception out of readall().

    Uses deliberately malformed JSON input so the service reports a fatal
    error to the handler.
    """
    # Arrange: deliberately malformed JSON lines (unquoted keys/values)
    # NOTE: the original test also built a large unused `data3` changefeed
    # literal; it was dead code and has been removed.
    data1 = b'{name: owner}'
    data2 = b'{name2: owner2}'
    data = data1 + b'\n' + data2 + b'\n' + data1

    # upload the json file
    file_name = self._get_file_reference()
    file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
    file_client.upload_data(data, overwrite=True)

    def on_error(error):
        # Re-raise so the failure surfaces to the caller of readall()
        raise Exception(error.description)

    input_format = DelimitedJsonDialect()
    output_format = DelimitedTextDialect(
        delimiter=';',
        quotechar="'",
        lineterminator='.',
        escapechar='\\'
    )
    resp = file_client.query_file(
        "SELECT * from BlobStorage",
        on_error=on_error,
        file_format=input_format,
        output_format=output_format)

    # Act/Assert: the handler's exception must propagate
    with pytest.raises(Exception):
        resp.readall()
def main():
    """Sample: run a quick query against a Data Lake file and print the result.

    Requires the AZURE_STORAGE_CONNECTION_STRING environment variable; exits
    with status 1 when it is missing.
    """
    try:
        CONNECTION_STRING = os.environ['AZURE_STORAGE_CONNECTION_STRING']
    except KeyError:
        print("AZURE_STORAGE_CONNECTION_STRING must be set.")
        sys.exit(1)

    datalake_service_client = DataLakeServiceClient.from_connection_string(
        CONNECTION_STRING)
    filesystem_name = "quickqueryfilesystem"
    filesystem_client = datalake_service_client.get_file_system_client(
        filesystem_name)
    try:
        filesystem_client.create_file_system()
    except Exception:
        # Best-effort create: the file system may already exist. A bare
        # `except:` would also swallow KeyboardInterrupt/SystemExit, so we
        # narrow it to Exception.
        pass

    # [START query]
    errors = []

    def on_error(error):
        errors.append(error)

    # upload the csv file
    file_client = datalake_service_client.get_file_client(
        filesystem_name, "csvfile")
    file_client.upload_data(CSV_DATA, overwrite=True)

    # select the second column of the csv file
    query_expression = "SELECT _2 from DataLakeStorage"
    input_format = DelimitedTextDialect(
        delimiter=',',
        quotechar='"',
        lineterminator='\n',
        escapechar="",
        has_header=False)
    output_format = DelimitedJsonDialect(delimiter='\n')
    reader = file_client.query_file(
        query_expression,
        on_error=on_error,
        file_format=input_format,
        output_format=output_format)
    content = reader.readall()
    # [END query]

    print(content)
    filesystem_client.delete_file_system()
def test_quick_query_readall_with_fatal_error_ignore(
        self, datalake_storage_account_name, datalake_storage_account_key):
    """readall() over malformed JSON input does not raise when no handler is set."""
    self._setUp(datalake_storage_account_name, datalake_storage_account_key)

    # Arrange: deliberately malformed JSON lines (unquoted keys/values)
    data1 = b'{name: owner}'
    data2 = b'{name2: owner2}'
    data = data1 + b'\n' + data2 + b'\n' + data1

    # upload the json file
    file_name = self._get_file_reference()
    file_client = self.dsc.get_file_client(self.filesystem_name, file_name)
    file_client.upload_data(data, overwrite=True)

    input_format = DelimitedJsonDialect()
    output_format = DelimitedTextDialect(
        delimiter=';',
        quotechar="'",
        lineterminator='.',
        escapechar='\\')

    # Act/Assert: readall must complete without raising
    resp = file_client.query_file(
        "SELECT * from BlobStorage",
        file_format=input_format,
        output_format=output_format)
    query_result = resp.readall()