def test_quick_query_iter_records_with_nonfatal_error_ignore(self, resource_group, location, storage_account, storage_account_key):
    """Nonfatal per-record errors are silently skipped when no on_error handler is supplied."""
    # Arrange: service client, test container, and a CSV blob to query.
    bsc = BlobServiceClient(
        self.account_url(storage_account, "blob"),
        credential=storage_account_key)
    self._setup(bsc)
    blob_name = self._get_blob_reference()
    blob_client = bsc.get_blob_client(self.container_name, blob_name)
    blob_client.upload_blob(CSV_DATA, overwrite=True)

    # Input has a header row; output uses a deliberately different dialect.
    input_format = DelimitedTextDialect(
        delimiter=',',
        quotechar='"',
        lineterminator='\n',
        escapechar='',
        has_header=True)
    output_format = DelimitedTextDialect(
        delimiter=';',
        quotechar="'",
        lineterminator='$',
        escapechar='\\')

    reader = blob_client.query_blob(
        "SELECT RepoPath from BlobStorage",
        blob_format=input_format,
        output_format=output_format)
    records = list(reader.records())

    # The whole blob must have been scanned; 32 data records come back.
    self.assertEqual(reader._size, len(CSV_DATA))
    self.assertEqual(len(records), 32)
    self._teardown(bsc)
def test_quick_query_iter_records_with_serialization_setting(self, resource_group, location, storage_account, storage_account_key):
    """records() honours custom input/output serialization dialects."""
    # Arrange: service client, test container, and a CSV blob to query.
    bsc = BlobServiceClient(self.account_url(storage_account, "blob"), credential=storage_account_key)
    self._setup(bsc)
    blob_name = self._get_blob_reference()
    blob_client = bsc.get_blob_client(self.container_name, blob_name)
    blob_client.upload_blob(CSV_DATA, overwrite=True)

    input_format = DelimitedTextDialect(
        delimiter=',', quotechar='"', lineterminator='\n', escapechar='', has_header=False)
    output_format = DelimitedTextDialect(
        delimiter=';', quotechar="'", lineterminator='%', escapechar='\\')

    resp = blob_client.query_blob(
        "SELECT * from BlobStorage", blob_format=input_format, output_format=output_format)
    # Keep only truthy (non-empty) records, mirroring the original filter.
    data = [record for record in resp.records() if record]

    self.assertEqual(len(resp), len(CSV_DATA))
    self.assertEqual(resp._size, resp._bytes_processed)
    self.assertEqual(len(data), 33)
    self._teardown(bsc)
def test_quick_query_iter_output_records_excluding_headers(self, resource_group, location, storage_account, storage_account_key):
    """With has_header=False on the output dialect, the header row is stripped from results."""
    # Arrange: service client, test container, and a CSV blob to query.
    bsc = BlobServiceClient(self.account_url(storage_account, "blob"), credential=storage_account_key)
    self._setup(bsc)
    blob_name = self._get_blob_reference()
    blob_client = bsc.get_blob_client(self.container_name, blob_name)
    blob_client.upload_blob(CSV_DATA, overwrite=True)

    resp = blob_client.query_blob(
        "SELECT * from BlobStorage",
        blob_format=DelimitedTextDialect(has_header=True),
        output_format=DelimitedTextDialect(has_header=False))
    record_iter = resp.records()

    # The first record must be data, not the CSV header line.
    data = next(record_iter)
    self.assertEqual(
        data,
        b'App Configuration,azure-data-appconfiguration,1,appconfiguration,FALSE')
    for record in record_iter:
        data += record

    self.assertEqual(len(resp), len(CSV_DATA))
    self.assertEqual(resp._size, resp._bytes_processed)
    # [44:] drops the header line that was excluded from the output.
    self.assertEqual(data, CSV_DATA.replace(b'\r\n', b'')[44:])
    self._teardown(bsc)
def test_quick_query_readall_with_fatal_error_ignore(self, resource_group, location, storage_account, storage_account_key):
    """readall() completes without raising when the query hits a fatal error and no on_error handler is set.

    Fix: the original bound the result to an unused local (``query_result``);
    the call's only purpose here is to verify it does not raise.
    """
    # Arrange
    bsc = BlobServiceClient(
        self.account_url(storage_account, "blob"),
        credential=storage_account_key)
    self._setup(bsc)

    # Malformed JSON lines (unquoted keys/values) — querying them as JSON
    # is what produces the fatal error this test ignores.
    data1 = b'{name: owner}'
    data2 = b'{name2: owner2}'
    data = data1 + b'\n' + data2 + b'\n' + data1

    blob_name = self._get_blob_reference()
    blob_client = bsc.get_blob_client(self.container_name, blob_name)
    blob_client.upload_blob(data, overwrite=True)

    input_format = DelimitedJsonDialect()
    output_format = DelimitedTextDialect(
        delimiter=';',
        quotechar="'",
        lineterminator='.',
        escapechar='\\')

    resp = blob_client.query_blob(
        "SELECT * from BlobStorage",
        blob_format=input_format,
        output_format=output_format)
    # The content is irrelevant; only the absence of an exception matters.
    resp.readall()
    self._teardown(bsc)
def test_quick_query_iter_records_with_fatal_error_ignore(self, resource_group, location, storage_account, storage_account_key):
    """records() iteration completes when the query hits a fatal error and no on_error handler is set.

    Fix: removed the dead ``data3`` literal — it was built but never uploaded
    or referenced anywhere in the test.
    """
    bsc = BlobServiceClient(self.account_url(storage_account, "blob"), credential=storage_account_key)
    self._setup(bsc)

    # Malformed JSON lines (unquoted keys/values) — querying them as JSON
    # is what produces the fatal error this test ignores.
    data1 = b'{name: owner}'
    data2 = b'{name2: owner2}'
    data = data1 + b'\n' + data2 + b'\n' + data1

    blob_name = self._get_blob_reference()
    blob_client = bsc.get_blob_client(self.container_name, blob_name)
    blob_client.upload_blob(data, overwrite=True)

    input_format = DelimitedJsonDialect()
    output_format = DelimitedTextDialect(delimiter=';', quotechar="'", lineterminator='.', escapechar='\\')

    resp = blob_client.query_blob("SELECT * from BlobStorage", blob_format=input_format, output_format=output_format)
    # Iterating must not raise even though the query encounters a fatal error.
    for record in resp.records():
        print(record)
    self._teardown(bsc)
def test_quick_query_iter_output_records_including_headers(self, storage_account_name, storage_account_key):
    """Without an output dialect, the header row is included in the results."""
    # Arrange: service client, test container, and a CSV blob to query.
    bsc = BlobServiceClient(
        self.account_url(storage_account_name, "blob"),
        credential=storage_account_key)
    self._setup(bsc)
    blob_name = self._get_blob_reference()
    blob_client = bsc.get_blob_client(self.container_name, blob_name)
    blob_client.upload_blob(CSV_DATA, overwrite=True)

    resp = blob_client.query_blob(
        "SELECT * from BlobStorage",
        blob_format=DelimitedTextDialect(has_header=True))
    record_iter = resp.records()

    # First record is the CSV header itself.
    data = next(record_iter)
    self.assertEqual(data, b'Service,Package,Version,RepoPath,MissingDocs')
    for record in record_iter:
        data += record

    self.assertEqual(len(resp), len(CSV_DATA))
    self.assertEqual(resp._size, resp._bytes_processed)
    self.assertEqual(data, CSV_DATA.replace(b'\r\n', b''))
    self._teardown(bsc)
def test_quick_query_readall_with_nonfatal_error_handler(self, resource_group, location, storage_account, storage_account_key):
    """A nonfatal error is delivered to the on_error callback exactly once."""
    # Arrange: service client, test container, and a CSV blob to query.
    bsc = BlobServiceClient(
        self.account_url(storage_account, "blob"),
        credential=storage_account_key)
    self._setup(bsc)
    blob_name = self._get_blob_reference()
    blob_client = bsc.get_blob_client(self.container_name, blob_name)
    blob_client.upload_blob(CSV_DATA, overwrite=True)

    captured = []

    def on_error(error):
        captured.append(error)

    input_format = DelimitedTextDialect(
        delimiter=',', quotechar='"', lineterminator='\n', escapechar='', has_header=True)
    output_format = DelimitedTextDialect(
        delimiter=';', quotechar="'", lineterminator='.', escapechar='\\')

    resp = blob_client.query_blob(
        "SELECT RepoPath from BlobStorage",
        blob_format=input_format,
        output_format=output_format,
        on_error=on_error)
    content = resp.readall()

    # Exactly one error: one source line carries a single column only.
    self.assertEqual(len(captured), 1)
    self.assertEqual(resp._size, len(CSV_DATA))
    self.assertTrue(len(content) > 0)
    self._teardown(bsc)
def test_quick_query_readall_with_serialization_setting(self, resource_group, location, storage_account, storage_account_key):
    """readall() with custom input/output dialects converts the full CSV without errors."""
    # Arrange: service client, test container, and a CSV blob to query.
    bsc = BlobServiceClient(
        self.account_url(storage_account, "blob"),
        credential=storage_account_key)
    self._setup(bsc)
    blob_name = self._get_blob_reference()
    blob_client = bsc.get_blob_client(self.container_name, blob_name)
    blob_client.upload_blob(CSV_DATA, overwrite=True)

    captured = []

    def on_error(error):
        captured.append(error)

    input_format = DelimitedTextDialect(
        delimiter=',', quotechar='"', lineterminator='\n', escapechar='', has_header=False)
    output_format = DelimitedTextDialect(
        delimiter=';', quotechar="'", lineterminator='.', escapechar='\\')

    resp = blob_client.query_blob(
        "SELECT * from BlobStorage",
        on_error=on_error,
        blob_format=input_format,
        output_format=output_format)
    content = resp.readall()

    # No errors expected; the converted payload must match the fixture.
    self.assertEqual(len(captured), 0)
    self.assertEqual(resp._size, len(CSV_DATA))
    self.assertEqual(content, CONVERTED_CSV_DATA)
    self._teardown(bsc)
def main():
    """Sample: upload a CSV blob, run a quick query over it, and print the result.

    Fix: the container-creation guard used a bare ``except:``, which also
    swallows ``SystemExit``/``KeyboardInterrupt``; narrowed to ``Exception``.
    """
    try:
        CONNECTION_STRING = os.environ['AZURE_STORAGE_CONNECTION_STRING']
    except KeyError:
        print("AZURE_STORAGE_CONNECTION_STRING must be set.")
        sys.exit(1)

    blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING)
    container_name = "quickquerycontainer"
    container_client = blob_service_client.get_container_client(container_name)
    try:
        container_client.create_container()
    except Exception:
        # Creation is best-effort: the container probably already exists.
        # NOTE(review): catching ResourceExistsError would be more precise,
        # but the original deliberately ignored all creation failures.
        pass

    # [START query]
    errors = []

    def on_error(error):
        errors.append(error)

    # upload the csv file
    blob_client = blob_service_client.get_blob_client(container_name, "csvfile")
    with open("./sample-blobs/quick_query.csv", "rb") as stream:
        blob_client.upload_blob(stream, overwrite=True)

    # select the second column of the csv file
    query_expression = "SELECT _2 from BlobStorage"
    input_format = DelimitedTextDialect(delimiter=',', quotechar='"', lineterminator='\n', escapechar="", has_header=False)
    output_format = DelimitedJsonDialect(delimiter='\n')
    reader = blob_client.query_blob(query_expression, on_error=on_error, blob_format=input_format, output_format=output_format)
    content = reader.readall()
    # [END query]

    print(content)
    container_client.delete_container()
def query_a_csv_blob(a_query, a_blob_url):
    """Run *a_query* against a CSV blob (first row treated as header) and
    return a ``csv.reader`` over the UTF-8 decoded result records."""
    client = BlobClient.from_blob_url(blob_url=a_blob_url)
    qa_reader = client.query_blob(
        a_query,
        blob_format=DelimitedTextDialect(has_header=True),
        encoding='utf-8')
    return csv.reader(qa_reader.records())

# TODO: refactor into a Resource class (nodbdb_client) exposing
# get / query_a_blob / insert_a_blob methods.
def query(a_query=store_conn['query_sql'], a_blob_url=store_conn['file_csv'], a_sas_key=store_conn['access_key']):
    """Helper to query json and/or csv files on a Blob/Datalake.

    :param a_query: SQL-like query text to run against the blob.
    :param a_blob_url: URL of the target blob; its extension selects the parser.
    :param a_sas_key: SAS token appended to the blob URL for authorization.
    :returns: list of non-empty result records (empty list when the file
        type is unsupported or yields no rows).

    NOTE(review): the defaults are read from ``store_conn`` once, at function
    definition time — later changes to ``store_conn`` are not picked up.
    """
    result_set = []
    start = time.perf_counter()

    # Get the file extension/type
    a_file_name, a_file_type = get_ext(a_blob_url)
    blob_client = BlobClient.from_blob_url(blob_url=a_blob_url + a_sas_key)

    if a_file_type == '.csv':
        qa_reader = blob_client.query_blob(
            a_query,
            blob_format=DelimitedTextDialect(has_header=True),
            encoding='utf-8')
    elif a_file_type == '.json':
        # BUGFIX: the keyword was misspelled 'delimeter', so the intended
        # ' ' delimiter never reached the dialect and a default was used.
        qa_reader = blob_client.query_blob(
            a_query,
            blob_format=DelimitedJsonDialect(delimiter=' '),
            encoding='utf-8',
            output_format=DelimitedJsonDialect(delimiter='\n'))
    elif a_file_type == '.parquet':
        qa_reader = None
        print("We'll do something about this")
    else:
        qa_reader = None
        print(f"Sorry, can't query a {a_file_type} file type")

    end = time.perf_counter()
    # Show (sarcastic voice) *usefully accurate* elapsed seconds and return records
    print(f"Time taken to get results {end - start} seconds")

    if qa_reader is None:
        # Raw string avoids the invalid '\_' escape-sequence warning;
        # the printed text is unchanged.
        print(r"No result found. Sorry human, better luck nextime ¯\_(ツ)_/¯")
    else:
        for row in qa_reader.records():
            if row:
                result_set.append(row)
    return result_set