def __readFileFromDataLake(self, blobPath):
    # Todo: Put in config file
    storage_account_name = self.config["datalakeInfo"]["storageAccountName"]
    file_system_name = self.config["datalakeInfo"]["fileSystemName"]
    client_id = self.config["datalakeInfo"]["clientId"]
    tenant_id = self.config["datalakeInfo"]["tenantId"]
    client_secret = self.config["datalakeInfo"]["clientSecret"]
    try:
        credential = azure.identity.ClientSecretCredential(
            tenant_id, client_id, client_secret)
        service_client = DataLakeServiceClient(
            account_url="{}://{}.dfs.core.windows.net".format(
                "https", storage_account_name),
            credential=credential)
        file_system_client = service_client.get_file_system_client(file_system_name)
        file_client = file_system_client.get_file_client(blobPath)
        downloaded = file_client.download_file()
        downloaded_bytes = downloaded.readall()
        return downloaded_bytes
    except Exception as e:
        print(e)
def test_restore_file_system_with_sas(self, datalake_storage_account_name, datalake_storage_account_key):
    pytest.skip(
        "We are generating a SAS token, so this test can only run live, and it also requires a soft-delete enabled account.")
    self._setUp(datalake_storage_account_name, datalake_storage_account_key)
    token = generate_account_sas(
        self.dsc.account_name,
        self.dsc.credential.account_key,
        ResourceTypes(service=True, file_system=True),
        AccountSasPermissions(read=True, write=True, list=True, delete=True),
        datetime.utcnow() + timedelta(hours=1),
    )
    dsc = DataLakeServiceClient(self.dsc.url, token)
    name = self._get_file_system_reference(prefix="filesystem")
    filesystem_client = dsc.create_file_system(name)
    filesystem_client.delete_file_system()

    # to make sure the filesystem is deleted
    with self.assertRaises(ResourceNotFoundError):
        filesystem_client.get_file_system_properties()

    filesystem_list = list(dsc.list_file_systems(include_deleted=True))
    self.assertTrue(len(filesystem_list) >= 1)

    restored_version = 0
    for filesystem in filesystem_list:
        # find the deleted filesystem and restore it
        if filesystem.deleted and filesystem.name == filesystem_client.file_system_name:
            restored_fs_client = dsc.undelete_file_system(
                filesystem.name,
                filesystem.deleted_version,
                new_name="restored" + name + str(restored_version))
            restored_version += 1

            # to make sure the deleted filesystem is restored
            props = restored_fs_client.get_file_system_properties()
            self.assertIsNotNone(props)
def setUp(self):
    super(LargeFileTest, self).setUp()
    url = self._get_account_url()
    self.payload_dropping_policy = PayloadDroppingPolicy()
    credential_policy = _format_shared_key_credential(
        self.settings.STORAGE_DATA_LAKE_ACCOUNT_NAME,
        self.settings.STORAGE_DATA_LAKE_ACCOUNT_KEY)
    self.dsc = DataLakeServiceClient(
        url,
        credential=self.settings.STORAGE_DATA_LAKE_ACCOUNT_KEY,
        logging_enable=True,
        _additional_pipeline_policies=[self.payload_dropping_policy, credential_policy])
    self.config = self.dsc._config
    self.file_system_name = self.get_resource_name('filesystem')

    if not self.is_playback():
        file_system = self.dsc.get_file_system_client(self.file_system_name)
        try:
            file_system.create_file_system(timeout=5)
        except ResourceExistsError:
            pass
def setUp(self):
    super(FileSystemTest, self).setUp()
    url = self._get_account_url()
    self.dsc = DataLakeServiceClient(
        url, credential=self.settings.STORAGE_DATA_LAKE_ACCOUNT_KEY)
    self.config = self.dsc._config
    self.test_file_systems = []
def test_read_file_with_user_delegation_key(self):
    # SAS URL is calculated from storage key, so this test runs live only
    if TestMode.need_recording_file(self.test_mode):
        return

    # Create file
    file_client = self._create_file_and_return_client()
    data = self.get_random_bytes(1024)

    # Upload data to file
    file_client.append_data(data, 0, len(data))
    file_client.flush_data(len(data))

    # Get user delegation key
    token_credential = self.generate_oauth_token()
    service_client = DataLakeServiceClient(self._get_oauth_account_url(), credential=token_credential)
    user_delegation_key = service_client.get_user_delegation_key(
        datetime.utcnow(), datetime.utcnow() + timedelta(hours=1))

    sas_token = generate_file_sas(file_client.account_name,
                                  file_client.file_system_name,
                                  None,
                                  file_client.path_name,
                                  user_delegation_key,
                                  permission=FileSasPermissions(read=True, create=True, write=True, delete=True),
                                  expiry=datetime.utcnow() + timedelta(hours=1),
                                  )

    # download the data and make sure it is the same as uploaded data
    new_file_client = DataLakeFileClient(self._get_account_url(),
                                         file_client.file_system_name,
                                         file_client.path_name,
                                         credential=sas_token)
    downloaded_data = new_file_client.download_file().readall()
    self.assertEqual(data, downloaded_data)
def __init__(self):
    url = self.datalake_account_url()
    key = self.datalake_account_key()
    print('url: {}'.format(url))
    print('key: {}'.format(key))
    self.service_client = DataLakeServiceClient(account_url=url, credential=key)
    print(self.service_client)
def connect_to_adls(acct_name, acct_key, fs):
    try:
        global service_client
        service_client = DataLakeServiceClient(
            account_url="{}://{}.dfs.core.windows.net".format("https", acct_name),
            credential=acct_key)
        file_system_client = service_client.get_file_system_client(file_system=fs)
        return file_system_client
    except Exception as e:
        print(e)
        return e
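# Hypothetical usage sketch for connect_to_adls above. The account name, key and
# file-system name are placeholders; get_paths() is the standard FileSystemClient
# call for enumerating paths under a directory.
if __name__ == "__main__":
    fs_client = connect_to_adls("mystorageaccount", "<account-key>", "my-filesystem")
    for path in fs_client.get_paths(path="raw", recursive=True):
        print(path.name)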
def _setUp(self, account_name, account_key):
    url = self._get_account_url(account_name)
    self.dsc = DataLakeServiceClient(url, credential=account_key, logging_enable=True)
    self.config = self.dsc._config
    self.filesystem_name = self.get_resource_name('utqqcontainer')

    if not self.is_playback():
        try:
            self.dsc.create_file_system(self.filesystem_name)
        except:
            pass
def setUp(self):
    super(StorageQuickQueryTest, self).setUp()
    url = self._get_account_url()
    self.dsc = DataLakeServiceClient(
        url,
        credential=self.settings.STORAGE_DATA_LAKE_ACCOUNT_KEY,
        logging_enable=True)
    self.config = self.dsc._config
    self.filesystem_name = self.get_resource_name('utqqcontainer')

    if not self.is_playback():
        try:
            self.dsc.create_file_system(self.filesystem_name)
        except:
            pass
def __init__(self, container_url, path):
    storage_account, file_system_name = get_details_from_container_url(container_url)
    self.container_url = container_url
    self.path = path
    self.storage_account = storage_account
    self.file_system_name = file_system_name
    storage_config = AzStorageConfig.objects.get(
        storage_account=storage_account, container_name=file_system_name)
    self.service = DataLakeServiceClient(
        f"https://{storage_account}.dfs.core.windows.net/",
        credential=storage_config.storage_account_key)
    self.directory_client = self.service.get_directory_client(file_system_name, path)
def setUp(self):
    super(FileTest, self).setUp()
    url = self._get_account_url()
    self.dsc = DataLakeServiceClient(
        url, credential=self.settings.STORAGE_DATA_LAKE_ACCOUNT_KEY)
    self.config = self.dsc._config
    self.file_system_name = self.get_resource_name('filesystem')

    if not self.is_playback():
        file_system = self.dsc.get_file_system_client(self.file_system_name)
        try:
            file_system.create_file_system(timeout=5)
        except ResourceExistsError:
            pass
def create_connection(self, storage_account_name, storage_account_key, container):
    try:
        dfs_url = "{}://{}.dfs.core.windows.net".format("https", storage_account_name)
        blob_url = "{}://{}.blob.core.windows.net/".format("https", storage_account_name)
        # print("ADLS URL:", dfs_url)
        # print("Blob URL:", blob_url)

        self.service_client = DataLakeServiceClient(
            account_url=dfs_url, credential=storage_account_key)

        # print("Getting file_system_client...")
        self.file_system_client = self.service_client.get_file_system_client(
            file_system=self.settings.storage_container)

        # print("Getting blob_service_client...")
        connect_string = "DefaultEndpointsProtocol=https;AccountName=" + storage_account_name \
            + ";AccountKey=" + storage_account_key + ";EndpointSuffix=core.windows.net"
        self.blob_service_client = BlobServiceClient.from_connection_string(
            conn_str=connect_string)

        # Create sas token for blob
        self.sas_token = generate_account_sas(
            account_name=self.blob_service_client.account_name,
            account_key=storage_account_key,
            resource_types=ResourceTypes(service=True, object=True, container=True),
            permission=AccountSasPermissions(read=True, write=True, delete=True,
                                             list=True, add=True, create=True),
            start=datetime.now() - timedelta(hours=1),
            expiry=datetime.utcnow() + timedelta(hours=4)  # Token valid for 4 hours
        )
        self.container_client = self.blob_service_client.get_container_client(container)

        # print("returning references.")
        return self.service_client, self.file_system_client, self.blob_service_client, \
            self.container_client, self.sas_token
    except Exception as e:
        print(e)
        return None, None, None, None, None
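# Hypothetical usage sketch for create_connection above. "storage" stands for an
# instance of the enclosing class (it must carry the self.settings object the
# method reads); the account name, key and container name are placeholders.
(service_client, file_system_client, blob_service_client,
 container_client, sas_token) = storage.create_connection(
    "mystorageaccount", "<account-key>", "my-container")
if service_client is None:
    raise RuntimeError("create_connection failed; see the printed exception above")
blob_client = container_client.get_blob_client("raw/input.csv")
print(blob_client.url + "?" + sas_token)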
def __init__(self, connection_string=os.getenv("AZURE_DT_2"),
             container_name_="container06",
             file_name_=""):  # assumed default; stored on self.file_name below
    account_name = os.getenv('STORAGE_ACCOUNT_NAME', "")
    account_key = os.getenv('STORAGE_ACCOUNT_KEY', "")

    # set up the service client with the credentials from the environment variables
    self.service_client = DataLakeServiceClient(
        account_url="{}://{}.dfs.core.windows.net".format("https", account_name),
        credential=account_key)

    self.file_system_name = container_name_
    self.file_name = file_name_
    self.dict_inh = {}
    self.dict_of_directory = {}
def test_preauthorize_user_with_user_delegation_key(
        self, datalake_storage_account_name, datalake_storage_account_key):
    self._setUp(datalake_storage_account_name, datalake_storage_account_key)
    # SAS URL is calculated from storage key, so this test runs live only

    # Create file
    file_client = self._create_file_and_return_client()
    data = self.get_random_bytes(1024)

    # Upload data to file
    file_client.append_data(data, 0, len(data))
    file_client.flush_data(len(data))
    file_client.set_access_control(
        owner="68390a19-a643-458b-b726-408abf67b4fc", permissions='0777')
    acl = file_client.get_access_control()

    # Get user delegation key
    token_credential = self.generate_oauth_token()
    service_client = DataLakeServiceClient(
        self._get_account_url(datalake_storage_account_name),
        credential=token_credential)
    user_delegation_key = service_client.get_user_delegation_key(
        datetime.utcnow(), datetime.utcnow() + timedelta(hours=1))

    sas_token = generate_file_sas(
        file_client.account_name,
        file_client.file_system_name,
        None,
        file_client.path_name,
        user_delegation_key,
        permission=FileSasPermissions(read=True, write=True,
                                      manage_access_control=True,
                                      manage_ownership=True),
        expiry=datetime.utcnow() + timedelta(hours=1),
        preauthorized_agent_object_id="68390a19-a643-458b-b726-408abf67b4fc"
    )

    # set the access control with the SAS-authenticated client and make sure it succeeds
    new_file_client = DataLakeFileClient(
        self._get_account_url(datalake_storage_account_name),
        file_client.file_system_name,
        file_client.path_name,
        credential=sas_token)
    acl = new_file_client.set_access_control(permissions='0777')
    self.assertIsNotNone(acl)
def test_list_system_filesystems(self, datalake_storage_account_name, datalake_storage_account_key):
    self._setUp(datalake_storage_account_name, datalake_storage_account_key)
    # Arrange
    dsc = DataLakeServiceClient(self.dsc.url, credential=datalake_storage_account_key)
    # Act
    filesystems = list(dsc.list_file_systems(include_system=True))
    # Assert
    found = False
    for fs in filesystems:
        if fs.name == "$logs":
            found = True
    self.assertEqual(found, True)
def file_system_sample(self):
    # [START create_file_system_client_from_service]
    # Instantiate a DataLakeServiceClient using a connection string
    from azure.storage.filedatalake import DataLakeServiceClient
    datalake_service_client = DataLakeServiceClient.from_connection_string(
        self.connection_string)

    # Instantiate a FileSystemClient
    file_system_client = datalake_service_client.get_file_system_client("mynewfilesystem")
    # [END create_file_system_client_from_service]

    try:
        # [START create_file_system]
        file_system_client.create_file_system()
        # [END create_file_system]

        # [START get_file_system_properties]
        properties = file_system_client.get_file_system_properties()
        # [END get_file_system_properties]
    finally:
        # [START delete_file_system]
        file_system_client.delete_file_system()
        # [END delete_file_system]
def _setUp(self, account_name, account_key):
    url = self._get_account_url(account_name)
    self.dsc = DataLakeServiceClient(url, credential=account_key, logging_enable=True)
    self.config = self.dsc._config
    self.file_system_name = self.get_resource_name('filesystem')

    if not self.is_playback():
        file_system = self.dsc.get_file_system_client(self.file_system_name)
        try:
            file_system.create_file_system(timeout=5)
        except ResourceExistsError:
            pass
def set_metadata_on_file_system(self):
    # Instantiate a DataLakeServiceClient using a connection string
    from azure.storage.filedatalake import DataLakeServiceClient
    datalake_service_client = DataLakeServiceClient.from_connection_string(
        self.connection_string)

    # Instantiate a FileSystemClient
    file_system_client = datalake_service_client.get_file_system_client(
        "mymetadatafilesystemsync")

    try:
        # Create new File System
        file_system_client.create_file_system()

        # [START set_file_system_metadata]
        # Create key, value pairs for metadata
        metadata = {'type': 'test'}

        # Set metadata on the file system
        file_system_client.set_file_system_metadata(metadata=metadata)
        # [END set_file_system_metadata]

        # Get file system properties
        properties = file_system_client.get_file_system_properties()
    finally:
        # Delete file system
        file_system_client.delete_file_system()
def get_file_system_client(self):
    connect_str = os.environ["ADLS_CONNECTION_STRING"]
    service_client = DataLakeServiceClient.from_connection_string(connect_str)
    file_system_client = service_client.get_file_system_client(
        file_system=self.file_system_name)
    return file_system_client
def get_directory_client_from_file_system(self):
    # Instantiate a DataLakeServiceClient using a connection string
    from azure.storage.filedatalake import DataLakeServiceClient
    datalake_service_client = DataLakeServiceClient.from_connection_string(
        self.connection_string)

    # Instantiate a FileSystemClient
    file_system_client = datalake_service_client.get_file_system_client("myfilesystem")

    # Create new File System
    try:
        file_system_client.create_file_system()
    except ResourceExistsError:
        pass

    # [START get_directory_client_from_file_system]
    # Get the DataLakeDirectoryClient from the FileSystemClient to interact with a specific directory
    directory_client = file_system_client.get_directory_client("mynewdirectory")
    # [END get_directory_client_from_file_system]

    # Delete file system
    file_system_client.delete_file_system()
def acquire_lease_on_file_system(self):
    # Instantiate a DataLakeServiceClient using a connection string
    # [START create_data_lake_service_client_from_conn_str]
    from azure.storage.filedatalake import DataLakeServiceClient
    datalake_service_client = DataLakeServiceClient.from_connection_string(
        self.connection_string)
    # [END create_data_lake_service_client_from_conn_str]

    # Instantiate a FileSystemClient
    file_system_client = datalake_service_client.get_file_system_client("myleasefilesystem")

    # Create new File System
    try:
        file_system_client.create_file_system()
    except ResourceExistsError:
        pass

    # [START acquire_lease_on_file_system]
    # Acquire a lease on the file system
    lease = file_system_client.acquire_lease()

    # Delete file system by passing in the lease
    file_system_client.delete_file_system(lease=lease)
    # [END acquire_lease_on_file_system]
def list_paths_in_file_system(self):
    # Instantiate a DataLakeServiceClient using a connection string
    from azure.storage.filedatalake import DataLakeServiceClient
    datalake_service_client = DataLakeServiceClient.from_connection_string(
        self.connection_string)

    # Instantiate a FileSystemClient
    file_system_client = datalake_service_client.get_file_system_client(
        "myfilesystemforlistpaths")

    # Create new File System
    file_system_client.create_file_system()

    # [START upload_file_to_file_system]
    with open(SOURCE_FILE, "rb") as data:
        file_client = file_system_client.get_file_client("myfile")
        file_client.create_file()
        file_client.append_data(data, 0)
        file_client.flush_data(data.tell())
    # [END upload_file_to_file_system]

    # [START get_paths_in_file_system]
    path_list = file_system_client.get_paths()
    for path in path_list:
        print(path.name + '\n')
    # [END get_paths_in_file_system]

    # Delete file system
    file_system_client.delete_file_system()
def initialize_adls(storage_account, client_id, client_secret, tenant_id):
    """Initialize the connection to an Azure Data Lake Gen 2

    Args:
        storage_account (string): The storage account name
        client_id (string): The service principal client id
        client_secret (string): The service principal client secret
        tenant_id (string): The azure tenant id

    Returns:
        [DataLakeServiceClient]: An Azure Data Lake Gen 2 client
    """
    try:
        credential = ClientSecretCredential(tenant_id, client_id, client_secret)
        service_client = DataLakeServiceClient(
            account_url="{}://{}.dfs.core.windows.net".format("https", storage_account),
            credential=credential)
        return service_client
    except Exception as e:
        print(e)
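# Hypothetical usage sketch for initialize_adls above. The service principal
# values are placeholders, and the file-system / directory names are assumptions;
# get_file_system_client, get_directory_client and create_directory are standard
# DataLakeServiceClient / FileSystemClient / DataLakeDirectoryClient calls.
adls_client = initialize_adls(
    storage_account="mystorageaccount",
    client_id="<client-id>",
    client_secret="<client-secret>",
    tenant_id="<tenant-id>")
file_system_client = adls_client.get_file_system_client("my-filesystem")
directory_client = file_system_client.get_directory_client("landing/2021")
directory_client.create_directory()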
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    datalake_service_client = DataLakeServiceClient.from_connection_string(
        self.connection_string)
    self.file_system_client = datalake_service_client.get_file_system_client(
        self.container_name)
def __init__(self):
    if os.environ.get('AZURE_SUBSCRIPTION_ID') is not None:
        self.subscription_id = os.environ.get(
            'AZURE_SUBSCRIPTION_ID')  # your Azure Subscription Id
    else:
        raise ValueError('AZURE_SUBSCRIPTION_ID environment variable missing')

    # Sanity check
    if os.environ.get('AZURE_CLIENT_ID') is None:
        raise ValueError('AZURE_CLIENT_ID environment variable missing')
    if os.environ.get('AZURE_CLIENT_SECRET') is None:
        raise ValueError('AZURE_CLIENT_SECRET environment variable missing')
    if os.environ.get('AZURE_TENANT_ID') is None:
        raise ValueError('AZURE_TENANT_ID environment variable missing')
    if os.environ.get('STORAGE_ACCOUNT_NAME') is None:
        raise ValueError('STORAGE_ACCOUNT_NAME environment variable missing')

    self.storage_account_name = os.environ['STORAGE_ACCOUNT_NAME']

    self.credentials = ServicePrincipalCredentials(
        client_id=os.environ['AZURE_CLIENT_ID'],
        secret=os.environ['AZURE_CLIENT_SECRET'],
        tenant=os.environ['AZURE_TENANT_ID'])

    # FIXME do we need two credentials?
    self.client_secret_credential = ClientSecretCredential(
        os.environ['AZURE_TENANT_ID'],
        os.environ['AZURE_CLIENT_ID'],
        os.environ['AZURE_CLIENT_SECRET'])

    self.resource_client = ResourceManagementClient(self.credentials, self.subscription_id)
    self.msi_client = ManagedServiceIdentityClient(self.credentials, self.subscription_id)
    self.policy_client = PolicyClient(self.credentials, self.subscription_id)
    self.authorization_client = AuthorizationManagementClient(
        self.credentials, self.subscription_id)
    self.storage_client = StorageManagementClient(self.credentials, self.subscription_id)

    # adls2 storage client
    self.adls2_client = DataLakeServiceClient(
        account_url="{}://{}.dfs.core.windows.net".format(
            "https", self.storage_account_name),
        credential=self.client_secret_credential)
def test_set_acl_with_user_delegation_key(self, datalake_storage_account_name, datalake_storage_account_key):
    self._setUp(datalake_storage_account_name, datalake_storage_account_key)
    # SAS URL is calculated from storage key, so this test runs live only

    # Create file
    file_client = self._create_file_and_return_client()
    data = self.get_random_bytes(1024)

    # Upload data to file
    file_client.append_data(data, 0, len(data))
    file_client.flush_data(len(data))

    # Get user delegation key
    token_credential = self.generate_oauth_token()
    service_client = DataLakeServiceClient(
        self._get_account_url(datalake_storage_account_name),
        credential=token_credential)
    user_delegation_key = service_client.get_user_delegation_key(
        datetime.utcnow(), datetime.utcnow() + timedelta(hours=1))

    sas_token = generate_file_sas(
        file_client.account_name,
        file_client.file_system_name,
        None,
        file_client.path_name,
        user_delegation_key,
        permission=FileSasPermissions(execute=True,
                                      manage_access_control=True,
                                      manage_ownership=True),
        expiry=datetime.utcnow() + timedelta(hours=1),
    )

    # set the access control with the SAS-authenticated client and make sure it is applied
    new_file_client = DataLakeFileClient(
        self._get_account_url(datalake_storage_account_name),
        file_client.file_system_name,
        file_client.path_name,
        credential=sas_token)
    acl = 'user::rwx,group::r-x,other::rwx'
    owner = "dc140949-53b7-44af-b1e9-cd994951fb86"
    new_file_client.set_access_control(acl=acl, owner=owner)

    access_control = new_file_client.get_access_control()
    self.assertEqual(acl, access_control['acl'])
    self.assertEqual(owner, access_control['owner'])
def initialize_storage_account(storage_account_name, storage_account_key):
    try:
        global service_client
        service_client = DataLakeServiceClient(
            account_url="{}://{}.dfs.core.windows.net".format("https", storage_account_name),
            credential=storage_account_key)
        # print(service_client)
    except Exception as e:
        print(e)
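# Hypothetical usage sketch for initialize_storage_account above. The account
# name/key and file-system name are placeholders; the function stores the client
# in the module-level service_client global, which is then used directly.
initialize_storage_account("mystorageaccount", "<account-key>")
file_system_client = service_client.create_file_system(file_system="my-filesystem")
print(file_system_client.get_file_system_properties().last_modified)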
def create_datalake_service_client() -> DataLakeServiceClient:
    """Return DataLake Service Client."""
    account_name = DL.STORAGE_ACCOUNT_NAME
    credential = DL.STORAGE_ACCOUNT_KEY
    account_url = f"https://{account_name}.dfs.core.windows.net/"
    datalake_service = DataLakeServiceClient(account_url=account_url, credential=credential)
    return datalake_service
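# Hypothetical usage sketch for create_datalake_service_client above. The
# file-system and file names are placeholders; get_file_client and upload_data
# with overwrite=True are standard DataLakeServiceClient / DataLakeFileClient calls.
datalake_service = create_datalake_service_client()
file_client = datalake_service.get_file_client("my-filesystem", "reports/summary.csv")
file_client.upload_data(b"col_a,col_b\n1,2\n", overwrite=True)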
def inner_transfer(self, directory, token):
    basename = "upload"
    suffix = datetime.now().strftime("%y%m%d_%H%M%S")
    filename = "_".join([basename, suffix])
    data = self.get_random_bytes(200 * 1024)
    upload_path = "{}/{}".format(directory, filename)

    print("\nUploading {} to directory: {}".format(filename, directory))
    service_client = DataLakeServiceClient(self.STORAGE_URL, credential=token)
    file_client = service_client.get_file_client(self.STORAGE_FILESYSTEM, upload_path)
    file_client.upload_data(data, overwrite=True, max_concurrency=3)

    print("Upload complete. Re-downloading file...")
    downloaded_data = file_client.download_file().readall()
    print("Downloaded file. Bytes read: {}".format(len(downloaded_data)))
def __missing__(self, key):
    endpoint, fs_name = key
    if endpoint not in self._service_clients:
        self._service_clients[endpoint] = DataLakeServiceClient(
            endpoint, credential=storage_account_key)
    fs_client = self._service_clients[endpoint].get_file_system_client(fs_name)
    self[key] = fs_client
    return fs_client
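# Hypothetical usage sketch for __missing__ above, assuming it lives on a dict
# subclass (called FileSystemClientCache here for illustration) that also defines
# the _service_clients mapping and has storage_account_key in scope. The endpoint
# and file-system names are placeholders.
cache = FileSystemClientCache()
key = ("https://mystorageaccount.dfs.core.windows.net", "my-filesystem")
fs_client = cache[key]          # first lookup builds and stores the client
assert cache[key] is fs_client  # later lookups reuse the cached FileSystemClient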