Example #1
def create_adls2_client(storage_account, credential):
    """
    Create an ADLS2 client.
    """
    account_url = _create_url(storage_account, "dfs")
    return DataLakeServiceClient(account_url, credential)
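The _create_url helper referenced above is not part of the snippet; a minimal sketch, assuming it only assembles the storage endpoint URL from the account name and the service subdomain:

def _create_url(storage_account, subdomain):
    # Hypothetical helper: builds e.g. "https://<account>.dfs.core.windows.net/"
    return "https://{}.{}.core.windows.net/".format(storage_account, subdomain)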
Example #2
# Create AAD group
group = GroupCreateParameters(display_name=group_name,
                              mail_nickname="GroupMail-at-microsoft.com")
graphrbac_client.groups.create(group)

# Change permissions of the bash script used to retrieve the AAD group ID
os.chmod('./script.sh', 0o755)
rc = subprocess.call("./script.sh")

# Retrieve the AAD group ID from file (strip the trailing newline so the ACL string stays valid)
with open("groupid.txt") as f:
    group_id = f.readline().strip()

# Create storage account credentials
storage_creds = ClientSecretCredential(tenant_id=tenant_id,
                                       client_id=client_id,
                                       client_secret=client_secret)

# Perform the data lake tasks
dl_service_client = DataLakeServiceClient(
    account_url="https://bluesofttaskdd.dfs.core.windows.net/",
    credential=storage_creds)

file_system_client = dl_service_client.get_file_system_client(
    file_system='file-system')
file_system_client.create_directory(group_name)

directory_client = file_system_client.get_directory_client(group_name)
directory_client.set_access_control(acl=f"default:group:{group_id}:r-x")
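To confirm the default ACL landed on the directory, the entry can be read back with get_access_control(); a short hedged sketch reusing the directory_client from above:

# Read the ACL back; the 'acl' field should contain "default:group:<group_id>:r-x"
acl_props = directory_client.get_access_control()
print(acl_props['acl'])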
Example #3
import os
import random

from flask import Flask, flash, request, redirect, url_for
from werkzeug.utils import secure_filename

from azure.storage.filedatalake import DataLakeServiceClient

app = Flask(__name__)

ALLOWED_EXTENSIONS = {'txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'}

account_name = os.getenv('STORAGE_ACCOUNT_NAME', "")
account_key = os.getenv('STORAGE_ACCOUNT_KEY', "")

# set up the service client with the credentials from the environment variables
service_client = DataLakeServiceClient(
    account_url="{}://{}.dfs.core.windows.net".format("https", account_name),
    credential=account_key)

print("Connected to service client")

# generate a random name for testing purposes
fs_name = "testfs{}".format(random.randint(1, 1000))
print("Generating a test filesystem named '{}'.".format(fs_name))

# create the filesystem
filesystem_client = service_client.create_file_system(file_system=fs_name)

print("Created filesystem")


def upload_file(filesystem_client, file):
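The body of upload_file is cut off in the source; a hedged sketch of how such a helper could be written with the filedatalake API (the 'uploads' directory name is an assumption):

def upload_file(filesystem_client, file):
    # Hypothetical completion: store the Flask upload under an assumed 'uploads' directory
    file_name = secure_filename(file.filename)
    directory_client = filesystem_client.get_directory_client("uploads")
    directory_client.create_directory()
    file_client = directory_client.create_file(file_name)
    data = file.read()
    file_client.append_data(data, offset=0, length=len(data))
    file_client.flush_data(len(data))
    return file_name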
Example #4
def _setUp(self, account_name, account_key):
    url = self.account_url(account_name, 'dfs')
    self.dsc = DataLakeServiceClient(url, account_key)
    self.config = self.dsc._config
    self.test_file_systems = []
Example #5
    logger.info("Authenticating to Box...")

    # Login as both folder's app user and service user.
    box_client_app = auth_jwt(box_config=BOX_CONFIG)
    # box_client_user = auth_jwt(box_config=join('config', 'box_config.json'),
    #                            user_id=BOX_USER)

    # To access box folders, collaborations must be in place.
    # check_or_create_collab(box_user_client=box_client_user,
    #                        box_service_client=box_client_app,
    #                        box_folder_id=BOX_FOLDER)

    logger.info("Authenticating to lake...")

    lake_service = DataLakeServiceClient(account_url=LAKE_URL,
                                         credential=LAKE_KEY)

    etl_ops = EtlOperations(box_client=box_client_app,
                            lake_client=lake_service,
                            lake_root=LAKE_CONTAINER,
                            target=LAKE_PATH)

    logger.info("Pulling daily files...")
    etl_ops.daily_pull(source=BOX_PATH,
                       source_mask=BOX_MASK,
                       source_rename=BOX_RENAME,
                       prev_label=TAB_NAME_PREV,
                       curr_label=TAB_NAME_CURR,
                       next_label=TAB_NAME_NEXT)

    logger.info("Pulling weekly files...")
Example #6
def main(event: func.EventGridEvent):
    result = json.dumps({
        'id': event.id,
        'data': event.get_json(),
        'topic': event.topic,
        'subject': event.subject,
        'event_type': event.event_type,
    })

    logging.info('Python EventGrid trigger processed an event: %s', result)

    blob_url = event.get_json().get('url')
    logging.info('blob URL: %s', blob_url)
    blob_name = blob_url.split("/")[-1].split("?")[0]
    logging.info('blob name: %s', blob_name)
    origin_container_name = blob_url.split("/")[-2].split("?")[0]
    logging.info('container name: %s', origin_container_name)
    storage_account_name = blob_url.split("//")[1].split(".")[0]
    logging.info('storage account name: %s', storage_account_name)

    ams_account_name = os.getenv('ACCOUNTNAME')
    resource_group_name = os.getenv('RESOURCEGROUP')
    subscription_id = os.getenv('SUBSCRIPTIONID')
    client_id = os.getenv('AZURE_CLIENT_ID')
    client_secret = os.getenv('AZURE_CLIENT_SECRET')
    TENANT_ID = os.getenv('AZURE_TENANT_ID')
    storage_blob_url = 'https://' + storage_account_name + '.blob.core.windows.net/'
    transform_name = 'faceredact'
    LOGIN_ENDPOINT = AZURE_PUBLIC_CLOUD.endpoints.active_directory
    RESOURCE = AZURE_PUBLIC_CLOUD.endpoints.active_directory_resource_id

    logging.info('login_endpoint: %s', LOGIN_ENDPOINT)
    logging.info('tenant_id: %s', TENANT_ID)

    out_asset_name = 'faceblurringOutput_' + datetime.utcnow().strftime(
        "%m-%d-%Y_%H:%M:%S")
    out_alternate_id = 'faceblurringOutput_' + datetime.utcnow().strftime(
        "%m-%d-%Y_%H:%M:%S")
    out_description = 'Redacted video with blurred faces'

    context = adal.AuthenticationContext(LOGIN_ENDPOINT + "/" + TENANT_ID)
    credentials = AdalAuthentication(
        context.acquire_token_with_client_credentials, RESOURCE, client_id,
        client_secret)
    client = AzureMediaServices(credentials, subscription_id)

    output_asset = Asset(alternate_id=out_alternate_id,
                         description=out_description)
    client.assets.create_or_update(resource_group_name, ams_account_name,
                                   out_asset_name, output_asset)

    token_credential = DefaultAzureCredential()
    datalake_service_client = DataLakeServiceClient(
        account_url=storage_blob_url, credential=token_credential)

    delegation_key = datalake_service_client.get_user_delegation_key(
        key_start_time=datetime.utcnow(),
        key_expiry_time=datetime.utcnow() + timedelta(hours=1))

    sas_token = generate_file_sas(account_name=storage_account_name,
                                  file_system_name=origin_container_name,
                                  directory_name="",
                                  file_name=blob_name,
                                  credential=delegation_key,
                                  permission=FileSasPermissions(read=True),
                                  expiry=datetime.utcnow() +
                                  timedelta(hours=1),
                                  protocol="https")

    sas_url = "{}?{}".format(blob_url, sas_token)
    logging.info(sas_url)

    job_name = 'Faceblurring-job_' + datetime.utcnow().strftime(
        "%m-%d-%Y_%H:%M:%S")
    job_input = JobInputHttp(label="Video_asset", files=[sas_url])
    job_output = JobOutputAsset(asset_name=out_asset_name)
    job_parameters = Job(input=job_input, outputs=[job_output])

    client.jobs.create(resource_group_name,
                       ams_account_name,
                       transform_name,
                       job_name,
                       parameters=job_parameters)
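The sample ends once the job is submitted; a hedged continuation of main() that checks on the job afterwards (client.jobs.get is the standard lookup; the 30-second wait and a module-level import of time are assumptions):

    # Hedged continuation (not part of the original sample): look the job up again
    # after a short wait and log its current state
    time.sleep(30)
    job = client.jobs.get(resource_group_name, ams_account_name, transform_name,
                          job_name)
    logging.info('Job %s is currently in state %s', job_name, job.state)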
Example #7
import os
import random
import uuid

from azure.core.exceptions import ResourceExistsError
from azure.storage.filedatalake import DataLakeServiceClient

#connection_string = os.getenv('AZURE_STORAGE_LAKE_CONNECTION_STRING')

account_name = os.getenv("STORAGE_ACCOUNT_NAME")
credential = os.getenv("STORAGE_ACCOUNT_KEY")
account_url = "https://{}.dfs.core.windows.net/".format(account_name)

file_name = 'yellow_tripdata_2020-01.csv'

datalake_service = DataLakeServiceClient(account_url=account_url,
                                         credential=credential)
file_system = "chernysh"  # like container in BlobServiceClient

try:
    filesystem_client = datalake_service.create_file_system(
        file_system=file_system)

    dir_client = filesystem_client.get_directory_client("folder_yellow")
    dir_client.create_directory()

    with open(file_name, "rb") as data:
        file_client = dir_client.get_file_client(file_name)
        file_client.create_file()
        file_client.append_data(data, 0)
        file_client.flush_data(data.tell())
except ResourceExistsError:
    # The file system (or directory/file) already exists; skip the upload
    print("File system '{}' already exists".format(file_system))
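A hedged follow-up, not in the original snippet, that reads the uploaded CSV back as a quick sanity check:

# Sanity check (assumes the upload above succeeded): download the file and report its size
file_client = datalake_service.get_file_system_client(file_system) \
    .get_directory_client("folder_yellow") \
    .get_file_client(file_name)
downloaded = file_client.download_file().readall()
print("Uploaded {} bytes".format(len(downloaded)))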
Example #8
import json
import os

from azure.storage.filedatalake import DataLakeServiceClient


def run():
    account_name = ""
    account_key = ""
    account_url = "{}://{}.dfs.core.windows.net".format("https", account_name)
    fs_name = ""
    userName = ""
    sourceDir = ""

    service_client = DataLakeServiceClient(account_url, credential=account_key)
    print()
    print()
    # Using existing file system (Admin created)
    print(" - Finding a filesystem named '{}'.".format(fs_name))
    filesystem_client = service_client.get_file_system_client(
        file_system=fs_name)

    # Creating a folder based on user name

    print(" - Creating a directory named '{}'.".format(userName))
    directory_client = filesystem_client.create_directory(
        userName,
        content_settings=None,
        metadata={
            'Source': 'rail_data',
            'sourceUrl': 'http://127.0.0.1?13456789'
        })

    # Set permissions on folder for XID
    acl = "user::rwx,user:{}@company.com:rwx,group::r-x,mask::rwx,other::---,default:user::rwx,default:user:{}@company.com:rwx,default:group::r-x,default:mask::rwx,default:other::---".format(
        userName, userName)
    print(" - Setting permissions on named '{}'.".format(userName))
    directory_client.set_access_control(owner=None,
                                        group=None,
                                        permissions=None,
                                        acl=acl)

    # uploading all files in a directory

    print(" - Uploading all files in directory")

    for file_name in os.listdir(sourceDir):
        print("     - Opening a file named '{}'.".format(file_name))
        with open(os.path.join(sourceDir, file_name), "r") as source_file:
            data = source_file.read()
        print("     - Uploading a file named '{}'.".format(file_name))
        file_client = directory_client.create_file(
            file_name,
            content_settings=None,
            metadata={'SourceFileName': file_name})
        file_client.append_data(data, offset=0, length=len(data))
        file_client.flush_data(len(data))
        print("     - Finished uploading '{}'.".format(file_name))

    print(" - Finished uploading all files in directory")

    # create a PBIDS file

    data = {"version": "0.1"}
    data['connections'] = []
    data['connections'].append({
        'details': {
            'protocol': 'azure-data-lake-storage',
            "address": {
                'server': '',
                "path": ''
            }
        },
        "options": {},
        "mode": "Import"
    })
    data['connections'][0]['details']['address'].update(
        {'server': account_url})
    data['connections'][0]['details']['address'].update(
        {'path': "/{}/{}".format(fs_name, userName)})
    print(" - Creating PBIDS file: '{}'.".format(userName + '.PBIDS'))

    with open(userName + '.PBIDS', 'w') as outfile:
        json.dump(data, outfile)
Example #9
def list_directory_contents():
    try:

        file_system_client = service_client.get_file_system_client(
            file_system="test")

        paths = file_system_client.get_paths(path="test")

        for path in paths:
            print(path.name + '\n')

    except Exception as e:
        print(e)


try:
    global service_client

    service_client = DataLakeServiceClient(
        account_url="{}://{}.dfs.core.windows.net".format(
            "https", 'adlsgen2encrypted'),
        credential=
        'mVEft0AuOaK2eDpEmPJ+Nb+SesJQKVj/b4noCaAmcAXyiOwWxpExK5jf+ZGe5N+Vc938A5ShYbf4z3D1zaH4TA=='
    )

except Exception as e:
    print(e)

list_directory_contents()

Example #10
# The sample below connects to an ADLS Gen2 account via a service principal and then lists its file systems

import os
from azure.storage.filedatalake import DataLakeServiceClient
from azure.identity import ClientSecretCredential

AZURE_CLIENT_ID = "XXXXXXXXXXXXXXXXXXXXXXXXXX"
AZURE_TENANT_ID = "XXXXXXXXXXXXXXXXXXXXXXXXXX"
AZURE_CLIENT_SECRET = "XXXXXXXXXXXXXXXXXXXXXXXXXX"
AZURE_STORAGE_ACCOUNT_NAME = 'XXXXXXXXXXXXXXXXXXXXXXXXXX'

configcredentials = ClientSecretCredential(
    client_id=AZURE_CLIENT_ID,
    tenant_id=AZURE_TENANT_ID,
    client_secret=AZURE_CLIENT_SECRET)

# Construct the DataLakeServiceClient
service_client = DataLakeServiceClient(account_url="{}://{}.dfs.core.windows.net".format(
        "https",
        AZURE_STORAGE_ACCOUNT_NAME),
        credential=configcredentials)

# Perform the listing
file_systems = service_client.list_file_systems()
for file_system in file_systems:
    print(file_system.name)
Example #11
def _setUp(self, account_name, account_key):
    url = self._get_account_url(account_name)
    self.dsc = DataLakeServiceClient(url, account_key)
    self.config = self.dsc._config
Example #12
def datalake_client(account_name, account_key):
    account_url = "https://{}.dfs.core.windows.net/".format(account_name)
    client = DataLakeServiceClient(account_url=account_url,
                                   credential=account_key)
    yield client
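The yield suggests this function is meant to be a pytest fixture; a hedged usage sketch (the test below is an assumption, and the fixture would still need an @pytest.fixture decorator plus account_name/account_key fixtures, e.g. from a conftest.py):

# Hypothetical test consuming the fixture above
def test_list_file_systems(datalake_client):
    file_systems = list(datalake_client.list_file_systems())
    assert isinstance(file_systems, list)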
Example #13
# Enable the VM's Managed Identity and assign the necessary RBAC roles or ACLs to that identity on your ADLS Gen 2 storage account

# This sample file contains Python 3 code to interact with an ADLS Gen 2 storage account
# using an Azure VM's Managed Identity for authentication

import os, uuid, sys
from azure.storage.filedatalake import DataLakeServiceClient
from azure.identity import ManagedIdentityCredential

# Using ManagedIdentityCredential to authenticate with the VM's Managed Service Identity
credential = ManagedIdentityCredential()
try:
    #create a DataLakeServiceClient with VMs MSI Credential
    global service_client
    service_client = DataLakeServiceClient(
        account_url="{}://{}.dfs.core.windows.net".format(
            "https", "adlsgen2account"),
        credential=credential)
    print("Create a data lake service client")

    #create a file system client and create a new filesystem/container
    global file_system_client
    file_system_client = service_client.create_file_system(
        file_system="file-system")
    print("New file system created")

    #create a new directory in the filesystem
    file_system_client.create_directory("my-directory")
    print("New directory created")

    print("Uploading local file to ADLS Gen 2")
    #get the client of the newly created directory
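The snippet stops after that comment; a hedged continuation under the same try block (the local file name 'sample.txt' and the bare-bones error handling are assumptions):

    # Hypothetical continuation: get the client of the new directory and upload a local file
    directory_client = file_system_client.get_directory_client("my-directory")
    file_client = directory_client.create_file("sample.txt")
    with open("sample.txt", "rb") as data:
        contents = data.read()
        file_client.append_data(contents, offset=0, length=len(contents))
        file_client.flush_data(len(contents))
    print("Uploaded local file to ADLS Gen 2")

except Exception as e:
    print(e)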
Example #14
### Connect to Azure storage
## Get key and create file systems

storage_client = StorageManagementClient(credentials,
                                         os.environ.get("subscription_id"))

storage_keys = storage_client.storage_accounts.list_keys(
    os.environ.get("resource_group_name"),
    os.environ.get("storageAccountName"))

storage_keys = {v.key_name: v.value for v in storage_keys.keys}
print('\tKey 1: {}'.format(storage_keys['key1']))
print('\tKey 2: {}'.format(storage_keys['key2']))

datalake_client = DataLakeServiceClient(
    account_url="{}://{}.dfs.core.windows.net".format(
        "https", os.environ.get("storageAccountName")),
    credential=storage_keys['key1'])
print("Creating file systems")
datalake_client.create_file_system(file_system="bronze")
datalake_client.create_file_system(file_system="silver")
datalake_client.create_file_system(file_system="gold")
datalake_client.create_file_system(file_system="sandbox")

#############################
### Connect to Databricks
### Need to generate our Databricks Tokens


## Generate AAD Tokens
def get_aad_token(client_id, client_secret):
    # Acquire a token to authenticate against Azure management API
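The body of get_aad_token is cut off; a hedged sketch using azure-identity instead of raw HTTP (the "tenant_id" environment variable name is an assumption):

    # Hypothetical body: acquire a management-plane token with a service principal
    from azure.identity import ClientSecretCredential

    cred = ClientSecretCredential(tenant_id=os.environ.get("tenant_id"),
                                  client_id=client_id,
                                  client_secret=client_secret)
    return cred.get_token("https://management.azure.com/.default").token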
Example #15
import os, uuid, sys, pprint, pandas as pd
from azure.storage.filedatalake import DataLakeServiceClient
import time
try:
    global service_client

    service_client = DataLakeServiceClient(
        account_url="{}://{}.dfs.core.windows.net".format("https", "deepadls"),
        credential=
        "LmVS8nloXT9OHIKUaniYzTCkHapSl3K3U6T5hL4wB6KX5Fky9DDFY1r63fksBJ+2xgmkIxu0ljBZlv3p+N47kQ=="
    )
except Exception as e:
    print(e)

container = 'ny311'
directory = 'deeptesthub1'

# def master_function(container,directory):
#     try:

#         file_system_client = service_client.get_file_system_client(file_system=container)
#         directory_client = file_system_client.get_directory_client(directory)

#         acl_props = directory_client.get_access_control()

#         df = (acl_props['acl'])

#         ch = 'y'

#         df1 = pd.DataFrame([x for x in df.split('\n')[0].split(',')])
#         df2 = df1[0].str.split(':',expand =True)
Example #16
def setUp(self):
    super(FileSystemTest, self).setUp()
    url = self._get_account_url()
    self.dsc = DataLakeServiceClient(url, credential=self.settings.STORAGE_DATA_LAKE_ACCOUNT_KEY)
    self.config = self.dsc._config
    self.test_file_systems = []
Example #17
if __name__ == '__main__':
    if len(sys.argv) != 4:
        print('Please use the following syntax to call the script:')
        print('\tadls-acl.py <STORAGE_ACCT_NAME> <FILE_SYSTEM_NAME> <PATH>')
        print('Example:')
        print(
            '\tadls-acl.py mystorageaccountname rawdata folder1/subfolder1/subfolder1-2'
        )
        sys.exit()
    else:
        ACCOUNT_NAME, FILE_SYSTEM, TARGET_DIR = sys.argv[1:]

    # Clients
    credential = DefaultAzureCredential()
    service = DataLakeServiceClient(
        account_url=f'https://{ACCOUNT_NAME}.dfs.core.windows.net/',
        credential=credential)
    filesystem = service.get_file_system_client(file_system=FILE_SYSTEM)

    print('*' * 20)
    print(f'Storage Account Name: {ACCOUNT_NAME}')
    print(f'File System Name: {FILE_SYSTEM}')
    print('*' * 20)
    print(
        f'Running: Setting ACLs for all child paths (subdirectories and files) in {TARGET_DIR} to match parent.'
    )
    total_start = time.time()  # Start Timing
    asyncio.run(main(TARGET_DIR, filesystem))
    total_end = time.time()  # End Timing
    print("Complete: Recursive ACL configuration took {} seconds.".format(
        str(round(total_end - total_start, 2))))
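The async main() that walks the tree is not shown here; as a hedged alternative, newer azure-storage-file-datalake releases expose a recursive ACL helper directly on the directory client:

    # Hedged alternative (not the script's own main()): copy the target directory's ACL
    # down to all children with the SDK's built-in recursive call
    directory_client = filesystem.get_directory_client(TARGET_DIR)
    parent_acl = directory_client.get_access_control()['acl']
    directory_client.set_access_control_recursive(acl=parent_acl)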