def create_adls2_client(storage_account, credential):
    """Create an ADLS2 client."""
    account_url = _create_url(storage_account, "dfs")
    return DataLakeServiceClient(account_url, credential)
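# The _create_url helper is not shown in the snippet above. A minimal sketch, assuming
# it simply builds the account endpoint for the requested storage subdomain
# ("dfs" for ADLS Gen2, "blob" for Blob storage):
def _create_url(storage_account, subdomain):
    return "https://{}.{}.core.windows.net/".format(storage_account, subdomain)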
# Create AAD group
group = GroupCreateParameters(display_name=group_name,
                              mail_nickname="GroupMail-at-microsoft.com")
graphrbac_client.groups.create(group)

# Change permissions of the bash script used to retrieve the AAD group ID, then run it
os.chmod('./script.sh', 0o755)
rc = subprocess.call("./script.sh")

# Retrieve the AAD group ID from file (strip the newline so the ACL string is well-formed)
with open("groupid.txt") as f:
    group_id = f.readline().strip()

# Create storage account credentials
storage_creds = ClientSecretCredential(tenant_id=tenant_id,
                                       client_id=client_id,
                                       client_secret=client_secret)

# Perform the data lake tasks
dl_service_client = DataLakeServiceClient(
    account_url="https://bluesofttaskdd.dfs.core.windows.net/",
    credential=storage_creds)
file_system_client = dl_service_client.get_file_system_client(
    file_system='file-system')
file_system_client.create_directory(group_name)
directory_client = file_system_client.get_directory_client(group_name)
directory_client.set_access_control(acl=f"default:group:{group_id}:r-x")
import os
import random

from werkzeug.utils import secure_filename
from flask import Flask, flash, request, redirect, url_for
from azure.storage.filedatalake import (
    DataLakeServiceClient,
)

app = Flask(__name__)
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'}

account_name = os.getenv('STORAGE_ACCOUNT_NAME', "")
account_key = os.getenv('STORAGE_ACCOUNT_KEY', "")

# set up the service client with the credentials from the environment variables
service_client = DataLakeServiceClient(
    account_url="{}://{}.dfs.core.windows.net".format("https", account_name),
    credential=account_key)
print("Connected to service client")

# generate a random name for testing purposes
fs_name = "testfs{}".format(random.randint(1, 1000))
print("Generating a test filesystem named '{}'.".format(fs_name))

# create the filesystem
filesystem_client = service_client.create_file_system(file_system=fs_name)
print("Created filesystem")


def upload_file(filesystem_client, file):
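    # The original body is not included; the lines below are an assumed sketch that
    # streams the Flask upload into the filesystem via a Data Lake file client.
    file_name = secure_filename(file.filename)
    file_client = filesystem_client.get_file_client(file_name)
    contents = file.read()
    file_client.create_file()
    file_client.append_data(contents, offset=0, length=len(contents))
    file_client.flush_data(len(contents))
    return file_name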
def _setUp(self, account_name, account_key):
    url = self.account_url(account_name, 'dfs')
    self.dsc = DataLakeServiceClient(url, account_key)
    self.config = self.dsc._config
    self.test_file_systems = []
logger.info("Authenticating to Box...") # Login as both folder's app user and service user. box_client_app = auth_jwt(box_config=BOX_CONFIG) # box_client_user = auth_jwt(box_config=join('config', 'box_config.json'), # user_id=BOX_USER) # To access box folders, collaborations must be in place. # check_or_create_collab(box_user_client=box_client_user, # box_service_client=box_client_app, # box_folder_id=BOX_FOLDER) logger.info("Authenticating to lake...") lake_service = DataLakeServiceClient(account_url=LAKE_URL, credential=LAKE_KEY) etl_ops = EtlOperations(box_client=box_client_app, lake_client=lake_service, lake_root=LAKE_CONTAINER, target=LAKE_PATH) logger.info("Pulling daily files...") etl_ops.daily_pull(source=BOX_PATH, source_mask=BOX_MASK, source_rename=BOX_RENAME, prev_label=TAB_NAME_PREV, curr_label=TAB_NAME_CURR, next_label=TAB_NAME_NEXT) logger.info("Pulling weekly files...")
import json
import logging
import os
from datetime import datetime, timedelta

import adal
import azure.functions as func
from azure.identity import DefaultAzureCredential
from azure.mgmt.media import AzureMediaServices
from azure.mgmt.media.models import Asset, Job, JobInputHttp, JobOutputAsset
from azure.storage.filedatalake import (DataLakeServiceClient,
                                        FileSasPermissions, generate_file_sas)
from msrestazure.azure_active_directory import AdalAuthentication
from msrestazure.azure_cloud import AZURE_PUBLIC_CLOUD


def main(event: func.EventGridEvent):
    result = json.dumps({
        'id': event.id,
        'data': event.get_json(),
        'topic': event.topic,
        'subject': event.subject,
        'event_type': event.event_type,
    })
    logging.info('Python EventGrid trigger processed an event: %s', result)

    # Parse the source blob details out of the Event Grid payload
    blob_url = event.get_json().get('url')
    logging.info('blob URL: %s', blob_url)
    blob_name = blob_url.split("/")[-1].split("?")[0]
    logging.info('blob name: %s', blob_name)
    origin_container_name = blob_url.split("/")[-2].split("?")[0]
    logging.info('container name: %s', origin_container_name)
    storage_account_name = blob_url.split("//")[1].split(".")[0]
    logging.info('storage account name: %s', storage_account_name)

    # Media Services and AAD settings from environment variables
    ams_account_name = os.getenv('ACCOUNTNAME')
    resource_group_name = os.getenv('RESOURCEGROUP')
    subscription_id = os.getenv('SUBSCRIPTIONID')
    client_id = os.getenv('AZURE_CLIENT_ID')
    client_secret = os.getenv('AZURE_CLIENT_SECRET')
    TENANT_ID = os.getenv('AZURE_TENANT_ID')
    storage_blob_url = 'https://' + storage_account_name + '.blob.core.windows.net/'
    transform_name = 'faceredact'
    LOGIN_ENDPOINT = AZURE_PUBLIC_CLOUD.endpoints.active_directory
    RESOURCE = AZURE_PUBLIC_CLOUD.endpoints.active_directory_resource_id
    logging.info('login_endpoint: %s', LOGIN_ENDPOINT)
    logging.info('tenant_id: %s', TENANT_ID)

    out_asset_name = 'faceblurringOutput_' + datetime.utcnow().strftime(
        "%m-%d-%Y_%H:%M:%S")
    out_alternate_id = 'faceblurringOutput_' + datetime.utcnow().strftime(
        "%m-%d-%Y_%H:%M:%S")
    out_description = 'Redacted video with blurred faces'

    # Authenticate to Azure Media Services with a service principal
    context = adal.AuthenticationContext(LOGIN_ENDPOINT + "/" + TENANT_ID)
    credentials = AdalAuthentication(
        context.acquire_token_with_client_credentials, RESOURCE, client_id,
        client_secret)
    client = AzureMediaServices(credentials, subscription_id)

    # Create the output asset for the redacted video
    output_asset = Asset(alternate_id=out_alternate_id,
                         description=out_description)
    client.assets.create_or_update(resource_group_name, ams_account_name,
                                   out_asset_name, output_asset)

    # Generate a user-delegation SAS for the source blob
    token_credential = DefaultAzureCredential()
    datalake_service_client = DataLakeServiceClient(
        account_url=storage_blob_url, credential=token_credential)
    delegation_key = datalake_service_client.get_user_delegation_key(
        key_start_time=datetime.utcnow(),
        key_expiry_time=datetime.utcnow() + timedelta(hours=1))
    sas_token = generate_file_sas(account_name=storage_account_name,
                                  file_system_name=origin_container_name,
                                  directory_name="",
                                  file_name=blob_name,
                                  credential=delegation_key,
                                  permission=FileSasPermissions(read=True),
                                  expiry=datetime.utcnow() + timedelta(hours=1),
                                  protocol="https")
    sas_url = "{}?{}".format(blob_url, sas_token)
    logging.info(sas_url)

    # Submit the face-redaction job
    job_name = 'Faceblurring-job_' + datetime.utcnow().strftime(
        "%m-%d-%Y_%H:%M:%S")
    job_input = JobInputHttp(label="Video_asset", files=[sas_url])
    job_output = JobOutputAsset(asset_name=out_asset_name)
    job_parameters = Job(input=job_input, outputs=[job_output])
    client.jobs.create(resource_group_name, ams_account_name, transform_name,
                       job_name, parameters=job_parameters)
import os
import random
import uuid

from azure.core.exceptions import ResourceExistsError
from azure.storage.filedatalake import DataLakeServiceClient

# connection_string = os.getenv('AZURE_STORAGE_LAKE_CONNECTION_STRING')
account_name = os.getenv("STORAGE_ACCOUNT_NAME")
credential = os.getenv("STORAGE_ACCOUNT_KEY")
account_url = "https://{}.dfs.core.windows.net/".format(account_name)
file_name = 'yellow_tripdata_2020-01.csv'

datalake_service = DataLakeServiceClient(account_url=account_url,
                                         credential=credential)
file_system = "chernysh"  # like a container in BlobServiceClient

try:
    filesystem_client = datalake_service.create_file_system(
        file_system=file_system)
    dir_client = filesystem_client.get_directory_client("folder_yellow")
    dir_client.create_directory()
    with open(file_name, "rb") as data:
        file_client = dir_client.get_file_client(file_name)
        file_client.create_file()
        file_client.append_data(data, 0)
        file_client.flush_data(data.tell())
except ResourceExistsError as e:  # assumed handler; the filesystem already exists
    print(e)
def run():
    account_name = ""
    account_key = ""
    account_url = "{}://{}.dfs.core.windows.net".format("https", account_name)
    fs_name = ""
    userName = ""
    sourceDir = ""

    service_client = DataLakeServiceClient(account_url, credential=account_key)
    print()
    print()

    # Using existing file system (Admin created)
    print(" - Finding a filesystem named '{}'.".format(fs_name))
    filesystem_client = service_client.get_file_system_client(
        file_system=fs_name)

    # Creating a folder based on user name
    print(" - Creating a directory named '{}'.".format(userName))
    directory_client = filesystem_client.create_directory(
        userName,
        content_settings=None,
        metadata={
            'Source': 'rail_data',
            'sourceUrl': 'http://127.0.0.1?13456789'
        })

    # Set permissions on folder for XID
    acl = ("user::rwx,user:{}@company.com:rwx,group::r-x,mask::rwx,other::---,"
           "default:user::rwx,default:user:{}@company.com:rwx,default:group::r-x,"
           "default:mask::rwx,default:other::---").format(userName, userName)
    print(" - Setting permissions on '{}'.".format(userName))
    directory_client.set_access_control(owner=None,
                                        group=None,
                                        permissions=None,
                                        acl=acl)

    # Uploading all files in a directory
    print(" - Uploading all files in directory")
    for file_name in os.listdir(sourceDir):
        print(" - Opening a file named '{}'.".format(file_name))
        with open(os.path.join(sourceDir, file_name), "r") as source_file:
            data = source_file.read()
        print(" - Uploading a file named '{}'.".format(file_name))
        file_client = directory_client.create_file(
            file_name,
            content_settings=None,
            metadata={'SourceFileName': file_name})
        file_client.append_data(data, offset=0, length=len(data))
        file_client.flush_data(len(data))
        print(" - Finished uploading '{}'.".format(file_name))
    print(" - Finished uploading all files in directory")

    # Create a PBIDS file pointing at the new directory
    data = {"version": "0.1"}
    data['connections'] = []
    data['connections'].append({
        'details': {
            'protocol': 'azure-data-lake-storage',
            "address": {
                'server': '',
                "path": ''
            }
        },
        "options": {},
        "mode": "Import"
    })
    data['connections'][0]['details']['address'].update(
        {'server': account_url})
    data['connections'][0]['details']['address'].update(
        {'path': "/{}/{}".format(fs_name, userName)})

    print(" - Creating PBIDS file: '{}'.".format(userName + '.PBIDS'))
    with open(userName + '.PBIDS', 'w') as outfile:
        json.dump(data, outfile)
def list_directory_contents():
    try:
        file_system_client = service_client.get_file_system_client(
            file_system="test")
        paths = file_system_client.get_paths(path="test")
        for path in paths:
            print(path.name + '\n')
    except Exception as e:
        print(e)


try:
    global service_client
    service_client = DataLakeServiceClient(
        account_url="{}://{}.dfs.core.windows.net".format(
            "https", 'adlsgen2encrypted'),
        credential=
        'mVEft0AuOaK2eDpEmPJ+Nb+SesJQKVj/b4noCaAmcAXyiOwWxpExK5jf+ZGe5N+Vc938A5ShYbf4z3D1zaH4TA=='
    )
except Exception as e:
    print(e)

list_directory_contents()
# limitation of liability; and (iv) to indemnify, hold harmless, and defend Microsoft, its affiliates and
# suppliers from and against any third party claims or lawsuits, including attorneys’ fees, that arise or result
# from the use or distribution of the sample code.

# The sample below connects to an ADLS Gen2 account via a Service Principal and then lists its file systems.
import os

from azure.storage.filedatalake import DataLakeServiceClient
from azure.identity import ClientSecretCredential

AZURE_CLIENT_ID = "XXXXXXXXXXXXXXXXXXXXXXXXXX"
AZURE_TENANT_ID = "XXXXXXXXXXXXXXXXXXXXXXXXXX"
AZURE_CLIENT_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXX"
AZURE_STORAGE_ACCOUNT_NAME = 'XXXXXXXXXXXXXXXXXXXXXXXXXX'

configcredentials = ClientSecretCredential(
    client_id=AZURE_CLIENT_ID,
    tenant_id=AZURE_TENANT_ID,
    client_secret=AZURE_CLIENT_SECRET)

# Construct the DataLakeServiceClient
service_client = DataLakeServiceClient(
    account_url="{}://{}.dfs.core.windows.net".format(
        "https", AZURE_STORAGE_ACCOUNT_NAME),
    credential=configcredentials)

# List the file systems in the account
file_systems = service_client.list_file_systems()
for file_system in file_systems:
    print(file_system.name)
def _setUp(self, account_name, account_key):
    url = self._get_account_url(account_name)
    self.dsc = DataLakeServiceClient(url, account_key)
    self.config = self.dsc._config
def datalake_client(account_name, account_key):
    account_url = "https://{}.dfs.core.windows.net/".format(account_name)
    client = DataLakeServiceClient(account_url=account_url,
                                   credential=account_key)
    yield client
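# The generator above reads like a pytest fixture. A hypothetical test consuming it,
# assuming it is registered with @pytest.fixture (e.g. in conftest.py) and that
# account_name/account_key are themselves fixtures supplying valid credentials:
def test_list_file_systems(datalake_client):
    # Should succeed without raising if the credentials are valid
    names = [fs.name for fs in datalake_client.list_file_systems()]
    assert isinstance(names, list)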
# Enable the VM's Managed Identity and assign the necessary RBAC or ACLs to the identity
# on your ADLS Gen 2 Storage account.
# This file contains Python 3 sample code to interact with an ADLS Gen 2 Storage account
# using an Azure VM's Managed Identity for authentication.
import os, uuid, sys

from azure.storage.filedatalake import DataLakeServiceClient
from azure.identity import ManagedIdentityCredential

# using ManagedIdentityCredential to use the VM's Managed Service Identity
credential = ManagedIdentityCredential()

try:
    # create a DataLakeServiceClient with the VM's MSI credential
    global service_client
    service_client = DataLakeServiceClient(
        account_url="{}://{}.dfs.core.windows.net".format(
            "https", "adlsgen2account"),
        credential=credential)
    print("Create a data lake service client")

    # create a file system client and create a new filesystem/container
    global file_system_client
    file_system_client = service_client.create_file_system(
        file_system="file-system")
    print("New file system created")

    # create a new directory in the filesystem
    file_system_client.create_directory("my-directory")
    print("New directory created")

    print("Uploading local file to ADLS Gen 2")
    # get the client of the newly created directory
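    # The continuation below is an assumed sketch (the original snippet stops at the
    # comment above): get the directory client and upload a hypothetical local file.
    directory_client = file_system_client.get_directory_client("my-directory")
    file_client = directory_client.create_file("local-file.txt")  # hypothetical file name
    with open("local-file.txt", "rb") as local_file:
        contents = local_file.read()
    file_client.append_data(contents, offset=0, length=len(contents))
    file_client.flush_data(len(contents))
    print("Upload complete")
except Exception as e:
    print(e)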
### Connect to Azure storage
## Get key and create file systems
storage_client = StorageManagementClient(credentials,
                                         os.environ.get("subscription_id"))
storage_keys = storage_client.storage_accounts.list_keys(
    os.environ.get("resource_group_name"),
    os.environ.get("storageAccountName"))
storage_keys = {v.key_name: v.value for v in storage_keys.keys}
print('\tKey 1: {}'.format(storage_keys['key1']))
print('\tKey 2: {}'.format(storage_keys['key2']))

datalake_client = DataLakeServiceClient(
    account_url="{}://{}.dfs.core.windows.net".format(
        "https", os.environ.get("storageAccountName")),
    credential=storage_keys['key1'])

print("Creating file systems")
datalake_client.create_file_system(file_system="bronze")
datalake_client.create_file_system(file_system="silver")
datalake_client.create_file_system(file_system="gold")
datalake_client.create_file_system(file_system="sandbox")

#############################
### Connect to Databricks
### Need to generate our Databricks Tokens

## Generate AAD Tokens
def get_aad_token(client_id, client_secret):
    # Acquire a token to authenticate against Azure management API
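    # The original body is not included; the lines below are an assumed sketch that
    # acquires a management-API token with the client-credentials flow via
    # azure.identity, reading tenant_id from the environment like the settings above.
    from azure.identity import ClientSecretCredential
    aad_credential = ClientSecretCredential(
        tenant_id=os.environ.get("tenant_id"),
        client_id=client_id,
        client_secret=client_secret)
    token = aad_credential.get_token("https://management.azure.com/.default")
    return token.token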
import os, uuid, sys, pprint, pandas as pd
from azure.storage.filedatalake import DataLakeServiceClient
import time

try:
    global service_client
    service_client = DataLakeServiceClient(
        account_url="{}://{}.dfs.core.windows.net".format("https", "deepadls"),
        credential=
        "LmVS8nloXT9OHIKUaniYzTCkHapSl3K3U6T5hL4wB6KX5Fky9DDFY1r63fksBJ+2xgmkIxu0ljBZlv3p+N47kQ=="
    )
except Exception as e:
    print(e)

container = 'ny311'
directory = 'deeptesthub1'

# def master_function(container,directory):
#     try:
#         file_system_client = service_client.get_file_system_client(file_system=container)
#         directory_client = file_system_client.get_directory_client(directory)
#         acl_props = directory_client.get_access_control()
#         df = (acl_props['acl'])
#         ch = 'y'
#         df1 = pd.DataFrame([x for x in df.split('\n')[0].split(',')])
#         df2 = df1[0].str.split(':',expand =True)
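# A working sketch of the commented-out (and truncated) master_function above, assuming
# it is meant to read a directory's ACL and tabulate each entry with pandas. The names
# mirror the original draft; the return statement is an assumption.
def master_function(container, directory):
    file_system_client = service_client.get_file_system_client(file_system=container)
    directory_client = file_system_client.get_directory_client(directory)
    acl_props = directory_client.get_access_control()
    # ACL entries look like "user::rwx" or "default:group:<object-id>:r-x"
    df1 = pd.DataFrame([x for x in acl_props['acl'].split(',')])
    df2 = df1[0].str.split(':', expand=True)
    return df2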
def setUp(self):
    super(FileSystemTest, self).setUp()
    url = self._get_account_url()
    self.dsc = DataLakeServiceClient(
        url, credential=self.settings.STORAGE_DATA_LAKE_ACCOUNT_KEY)
    self.config = self.dsc._config
    self.test_file_systems = []
if __name__ == '__main__':
    if len(sys.argv) != 4:
        print('Please use the following syntax to call the script:')
        print('\tadls-acl.py <STORAGE_ACCT_NAME> <FILE_SYSTEM_NAME> <PATH>')
        print('Example:')
        print(
            '\tadls-acl.py mystorageaccountname rawdata folder1/subfolder1/subfolder1-2'
        )
        sys.exit()
    else:
        ACCOUNT_NAME, FILE_SYSTEM, TARGET_DIR = sys.argv[1:]

    # Clients
    credential = DefaultAzureCredential()
    service = DataLakeServiceClient(
        account_url=f'https://{ACCOUNT_NAME}.dfs.core.windows.net/',
        credential=credential)
    filesystem = service.get_file_system_client(file_system=FILE_SYSTEM)

    print('*' * 20)
    print(f'Storage Account Name: {ACCOUNT_NAME}')
    print(f'File System Name: {FILE_SYSTEM}')
    print('*' * 20)
    print(
        f'Running: Setting ACLs for all child paths (subdirectories and files) in {TARGET_DIR} to match parent.'
    )

    total_start = time.time()  # Start Timing
    asyncio.run(main(TARGET_DIR, filesystem))
    total_end = time.time()  # End Timing

    print("Complete: Recursive ACL configuration took {} seconds.".format(
        str(round(total_end - total_start, 2))))
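# The script above relies on a `main` coroutine that is not shown. A minimal sketch
# (assumed, not the original implementation) that copies the parent directory's ACL onto
# every child path; in the real script it would be defined above the entry point and
# would likely update paths concurrently rather than sequentially.
async def main(target_dir, filesystem_client):
    # Read the ACL from the parent directory
    parent_acl = filesystem_client.get_directory_client(
        target_dir).get_access_control()['acl']
    # "default:" entries only apply to directories, so strip them for files
    file_acl = ','.join(entry for entry in parent_acl.split(',')
                        if not entry.startswith('default:'))
    # Walk every child path and apply the matching ACL
    for path in filesystem_client.get_paths(path=target_dir, recursive=True):
        if path.is_directory:
            filesystem_client.get_directory_client(
                path.name).set_access_control(acl=parent_acl)
        else:
            filesystem_client.get_file_client(
                path.name).set_access_control(acl=file_acl)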