def file_system_sample(self):
    """Create a file system, read its properties, then remove it.

    The file system is always deleted, even when creation or the
    properties call raises, via the ``finally`` block.
    """
    # [START create_file_system_client_from_service]
    # Instantiate a DataLakeServiceClient using a connection string
    from azure.storage.filedatalake import DataLakeServiceClient
    datalake_service_client = DataLakeServiceClient.from_connection_string(
        self.connection_string)

    # Instantiate a FileSystemClient
    file_system_client = datalake_service_client.get_file_system_client(
        "mynewfilesystem")
    # [END create_file_system_client_from_service]

    try:
        # [START create_file_system]
        file_system_client.create_file_system()
        # [END create_file_system]

        # [START get_file_system_properties]
        properties = file_system_client.get_file_system_properties()
        # [END get_file_system_properties]
    finally:
        # [START delete_file_system]
        file_system_client.delete_file_system()
        # [END delete_file_system]
def acquire_lease_on_file_system(self):
    """Acquire a lease on a file system and delete it by passing the lease."""
    # Instantiate a DataLakeServiceClient using a connection string
    # [START create_data_lake_service_client_from_conn_str]
    from azure.storage.filedatalake import DataLakeServiceClient
    datalake_service_client = DataLakeServiceClient.from_connection_string(
        self.connection_string)
    # [END create_data_lake_service_client_from_conn_str]

    # Instantiate a FileSystemClient
    file_system_client = datalake_service_client.get_file_system_client(
        "myleasefilesystem")

    # Create new File System; a leftover one from a previous run is fine
    try:
        file_system_client.create_file_system()
    except ResourceExistsError:
        pass

    # [START acquire_lease_on_file_system]
    # Acquire a lease on the file system
    lease = file_system_client.acquire_lease()

    # Delete file system by passing in the lease
    file_system_client.delete_file_system(lease=lease)
    # [END acquire_lease_on_file_system]
def list_paths_in_file_system(self):
    """Upload a file into a file system, list its paths, then clean up.

    Fix over the original: the upload/list steps now run inside
    ``try``/``finally`` so the temporary file system is deleted even if a
    step raises (the original leaked it on error), matching the cleanup
    style of the other samples in this file.
    """
    # Instantiate a DataLakeServiceClient using a connection string
    from azure.storage.filedatalake import DataLakeServiceClient
    datalake_service_client = DataLakeServiceClient.from_connection_string(
        self.connection_string)

    # Instantiate a FileSystemClient
    file_system_client = datalake_service_client.get_file_system_client(
        "myfilesystemforlistpaths")

    # Create new File System
    file_system_client.create_file_system()
    try:
        # [START upload_file_to_file_system]
        with open(SOURCE_FILE, "rb") as data:
            file_client = file_system_client.get_file_client("myfile")
            file_client.create_file()
            file_client.append_data(data, 0)
            # data.tell() is the number of bytes appended above
            file_client.flush_data(data.tell())
        # [END upload_file_to_file_system]

        # [START get_paths_in_file_system]
        path_list = file_system_client.get_paths()
        for path in path_list:
            print(path.name + '\n')
        # [END get_paths_in_file_system]
    finally:
        # Delete file system
        file_system_client.delete_file_system()
def get_file_system_client(self):
    """Build a FileSystemClient for ``self.file_system_name``.

    The connection string is read from the ``ADLS_CONNECTION_STRING``
    environment variable (a ``KeyError`` is raised when it is unset).
    """
    connect_str = os.environ["ADLS_CONNECTION_STRING"]
    return DataLakeServiceClient.from_connection_string(
        connect_str).get_file_system_client(file_system=self.file_system_name)
def __init__(self, *args, **kwargs):
    """Initialize the base class and cache a FileSystemClient for the container."""
    super().__init__(*args, **kwargs)
    service_client = DataLakeServiceClient.from_connection_string(
        self.connection_string)
    self.file_system_client = service_client.get_file_system_client(
        self.container_name)
def set_metadata_on_file_system(self):
    """Set metadata on a temporary file system and read back its properties."""
    # Instantiate a DataLakeServiceClient using a connection string
    from azure.storage.filedatalake import DataLakeServiceClient
    service_client = DataLakeServiceClient.from_connection_string(
        self.connection_string)

    # Instantiate a FileSystemClient
    fs_client = service_client.get_file_system_client(
        "mymetadatafilesystemsync")

    try:
        # Create new File System
        fs_client.create_file_system()

        # [START set_file_system_metadata]
        # Create key, value pairs for metadata
        metadata = {'type': 'test'}

        # Set metadata on the file system
        fs_client.set_file_system_metadata(metadata=metadata)
        # [END set_file_system_metadata]

        # Get file system properties
        properties = fs_client.get_file_system_properties()
    finally:
        # Delete file system
        fs_client.delete_file_system()
def get_directory_client_from_file_system(self):
    """Obtain a DataLakeDirectoryClient from a FileSystemClient."""
    # Instantiate a DataLakeServiceClient using a connection string
    from azure.storage.filedatalake import DataLakeServiceClient
    service_client = DataLakeServiceClient.from_connection_string(
        self.connection_string)

    # Instantiate a FileSystemClient
    fs_client = service_client.get_file_system_client("myfilesystem")

    # Create new File System; tolerate one left over from a previous run
    try:
        fs_client.create_file_system()
    except ResourceExistsError:
        pass

    # [START get_directory_client_from_file_system]
    # Get the DataLakeDirectoryClient from the FileSystemClient to interact with a specific file
    directory_client = fs_client.get_directory_client("mynewdirectory")
    # [END get_directory_client_from_file_system]

    # Delete file system
    fs_client.delete_file_system()
def upload_dir_datalake(path: str, file_system_name: str = 'p4-data'):
    """Recursively upload every ``*.csv`` under *path* into the lake.

    Files land under directory *path* of *file_system_name*, keeping their
    location relative to *path*.

    Fix over the original: ``glob`` returns paths that already include the
    *path* prefix, while ``dir_cli`` is rooted at *path* — passing the full
    glob result to ``get_file_client`` duplicated the directory component
    in the remote name (``path/path/...``). The remote name is now the
    path relative to *path*.

    Errors are printed (best effort) rather than raised, matching the
    sibling upload helpers.
    """
    try:
        ser_cli = DataLakeServiceClient.from_connection_string(
            config.AZURE_STORAGE_CONNECTION_STRING)
        filesys_cli = ser_cli.get_file_system_client(
            file_system=file_system_name)
        dir_cli = filesys_cli.get_directory_client(path)
        csv_files = glob.glob(f'{path}/**/*.csv', recursive=True)
        for csv_f in csv_files:
            # Name the remote file relative to the directory client's root
            # so the local directory prefix is not duplicated remotely.
            file_cli = dir_cli.get_file_client(os.path.relpath(csv_f, path))
            with open(csv_f, 'r') as f:
                file_cli.upload_data(f.read(), overwrite=True)
    except Exception as e:
        print(e)
def main():
    """Upload CSV data to a Data Lake file and run a quick query against it.

    Requires the ``AZURE_STORAGE_CONNECTION_STRING`` environment variable;
    exits with status 1 when it is missing.

    Fix over the original: the bare ``except:`` around
    ``create_file_system`` also swallowed ``KeyboardInterrupt`` and
    ``SystemExit``; it is narrowed to ``except Exception`` (the intent is
    only to tolerate an already-existing file system).
    """
    try:
        CONNECTION_STRING = os.environ['AZURE_STORAGE_CONNECTION_STRING']
    except KeyError:
        print("AZURE_STORAGE_CONNECTION_STRING must be set.")
        sys.exit(1)

    datalake_service_client = DataLakeServiceClient.from_connection_string(
        CONNECTION_STRING)
    filesystem_name = "quickqueryfilesystem"
    filesystem_client = datalake_service_client.get_file_system_client(
        filesystem_name)
    try:
        filesystem_client.create_file_system()
    except Exception:
        # Best effort: the file system may already exist.
        pass

    # [START query]
    errors = []

    def on_error(error):
        errors.append(error)

    # upload the csv file
    file_client = datalake_service_client.get_file_client(
        filesystem_name, "csvfile")
    file_client.upload_data(CSV_DATA, overwrite=True)

    # select the second column of the csv file
    query_expression = "SELECT _2 from DataLakeStorage"
    input_format = DelimitedTextDialect(delimiter=',', quotechar='"',
                                        lineterminator='\n', escapechar="",
                                        has_header=False)
    output_format = DelimitedJsonDialect(delimiter='\n')
    reader = file_client.query_file(query_expression, on_error=on_error,
                                    file_format=input_format,
                                    output_format=output_format)
    content = reader.readall()
    # [END query]
    print(content)

    filesystem_client.delete_file_system()
def upload_to_adls(directory, filename, file_chunk_size=1048576):
    """Upload a local file to ADLS in fixed-size chunks.

    Connection string and file system come from the
    ``ADLS_CONNECTION_STRING`` / ``ADLS_FILE_SYSTEM_NAME`` environment
    variables. The remote file is created as *filename* under *directory*
    and flushed once all chunks are appended.
    """
    service_client = DataLakeServiceClient.from_connection_string(
        os.environ['ADLS_CONNECTION_STRING'])
    file_system_client = service_client.get_file_system_client(
        file_system=os.environ['ADLS_FILE_SYSTEM_NAME'])
    directory_client = file_system_client.get_directory_client(directory)
    file_client = directory_client.create_file(filename)

    with open(filename, 'rb') as local_file:
        offset = 0
        while True:
            chunk = local_file.read(file_chunk_size)
            if not chunk:
                break
            file_client.append_data(
                chunk,
                offset=offset,
                length=len(chunk),
                validate_content=True)
            offset += len(chunk)
        # Commit everything appended so far; offset == total bytes written.
        file_client.flush_data(offset)
def upload_dir_datalake_newfile(from_path: str, to_path: str, file_system_name: str = 'p4-data'):
    """Upload each ``*.csv`` directly inside *from_path* to *to_path* in the lake.

    Non-recursive: only the top level of *from_path* is scanned. Errors are
    printed (best effort) rather than raised.
    """
    try:
        ser_cli = DataLakeServiceClient.from_connection_string(
            config.AZURE_STORAGE_CONNECTION_STRING)
        filesys_cli = ser_cli.get_file_system_client(
            file_system=file_system_name)
        dir_cli = filesys_cli.get_directory_client(to_path)

        for name in os.listdir(from_path):
            if not name.endswith('.csv'):
                continue
            file_cli = dir_cli.get_file_client(name)
            with open(os.path.join(from_path, name), 'r') as f:
                file_cli.upload_data(f.read(), overwrite=True)
    except Exception as e:
        print(e)
def upload_file_datalake(filename: str, from_path: str, to_path: str, file_system_name: str = 'p4-data'):
    """Upload one local file from *from_path* into *to_path* of the file system.

    If the local file is not present yet, wait 3 seconds once before
    opening it (a concurrent producer may still be writing it — TODO
    confirm against the caller). Errors are printed (best effort) rather
    than raised.
    """
    try:
        ser_cli = DataLakeServiceClient.from_connection_string(
            config.AZURE_STORAGE_CONNECTION_STRING)
        filesys_cli = ser_cli.get_file_system_client(
            file_system=file_system_name)
        dir_cli = filesys_cli.get_directory_client(to_path)
        file_cli = dir_cli.get_file_client(filename)

        local_path = os.path.join(from_path, filename)
        if not os.path.exists(local_path):
            # Give the producer a moment to finish creating the file.
            time.sleep(3)
        with open(local_path, 'r') as f:
            file_cli.upload_data(f.read(), overwrite=True)
    except Exception as e:
        print(e)
def run_amiss():
    """Download task inputs from Data Lake, launch the AMISS pipeline, and
    return the submission info as JSON.

    Expects a JSON request body with 'account_url', 'account_credential',
    'container', and a 'task' object holding 'vcf_path', 'cadd_snv_path'
    and 'cadd_indel_path'.
    """
    req_body = request.json

    ## Get Data Lake Connection Ready
    account = req_body['account_url']
    account_key = req_body['account_credential']
    container = req_body['container']
    endpoint_suffix = "core.windows.net"
    conn_str = ("DefaultEndpointsProtocol=https;AccountName=" + account +
                ";AccountKey=" + account_key +
                ";EndpointSuffix=" + endpoint_suffix)
    service = DataLakeServiceClient.from_connection_string(conn_str=conn_str)
    fs_client = service.get_file_system_client(container)

    ## Get task info
    task = req_body['task']
    vcf_path = task['vcf_path']
    cadd_snv_path = task['cadd_snv_path']
    cadd_indel_path = task['cadd_indel_path']

    ## Make Unique Session ID
    sessionid = datetime.now().strftime('%Y%m%d%H%M%S_') + str(uuid4())

    ## Download each input file into this session's output directory
    dest_dir = '/app/amiss/output/' + sessionid + '/'
    for remote_path in (vcf_path, cadd_snv_path, cadd_indel_path):
        file_client = fs_client.get_file_client(remote_path)
        local_name = os.path.basename(remote_path)
        local_file_path = os.path.join(dest_dir, local_name)
        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
        with open(local_file_path, 'wb') as local_file:
            file_client.download_file().readinto(local_file)

    ## Environment variables consumed by run.sh
    rel_dir = 'output/' + sessionid + '/'
    os.environ['AMISS_SESSION_ID'] = sessionid
    os.environ['AMISS_SESSION_DIR'] = rel_dir
    os.environ['AMISS_VCF_FILENAME'] = rel_dir + os.path.basename(vcf_path)
    os.environ['AMISS_CADD_SNV_FILENAME'] = rel_dir + os.path.basename(cadd_snv_path)
    os.environ['AMISS_CADD_INDEL_FILENAME'] = rel_dir + os.path.basename(cadd_indel_path)

    ## Launch the pipeline asynchronously and record its status
    amiss_cmd = ["/bin/sh", "run.sh"]
    amiss_pipe = subprocess.Popen(amiss_cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    update_status(sessionid=sessionid, pid=amiss_pipe.pid,
                  status='Submitted', pipe=amiss_pipe, message='')

    output = {'task': 'amiss',
              'sessionid': sessionid,
              'pid': amiss_pipe.pid,
              'message': 'Task submitted successfully.'}
    return Response(json.dumps(output), 200, mimetype='application/json')
def _get_service_client_from_connection_string(self, connection_string: str):
    """Return a ``DataLakeServiceClient`` built from *connection_string*."""
    return DataLakeServiceClient.from_connection_string(conn_str=connection_string)
def data_lake_service_sample(self):
    """Tour of DataLakeServiceClient: connection-string and OAuth auth,
    user delegation keys, file system CRUD, and obtaining sub-clients.
    """
    # Instantiate a DataLakeServiceClient using a connection string
    # [START create_datalake_service_client]
    from azure.storage.filedatalake import DataLakeServiceClient
    datalake_service_client = DataLakeServiceClient.from_connection_string(
        self.connection_string)
    # [END create_datalake_service_client]

    # Instantiate a DataLakeServiceClient Azure Identity credentials.
    # [START create_datalake_service_client_oauth]
    from azure.identity import ClientSecretCredential
    token_credential = ClientSecretCredential(
        self.active_directory_tenant_id,
        self.active_directory_application_id,
        self.active_directory_application_secret,
    )
    datalake_service_client = DataLakeServiceClient(
        "https://{}.dfs.core.windows.net".format(self.account_name),
        credential=token_credential)
    # [END create_datalake_service_client_oauth]

    # get user delegation key
    # [START get_user_delegation_key]
    from datetime import datetime, timedelta
    user_delegation_key = datalake_service_client.get_user_delegation_key(
        datetime.utcnow(),
        datetime.utcnow() + timedelta(hours=1))
    # [END get_user_delegation_key]

    # Create file systems
    # [START create_file_system_from_service_client]
    datalake_service_client.create_file_system("filesystem")
    # [END create_file_system_from_service_client]
    file_system_client = datalake_service_client.create_file_system(
        "anotherfilesystem")

    # List file systems
    # [START list_file_systems]
    file_systems = datalake_service_client.list_file_systems()
    for file_system in file_systems:
        print(file_system.name)
    # [END list_file_systems]

    # Get Clients from DataLakeServiceClient
    file_system_client = datalake_service_client.get_file_system_client(
        file_system_client.file_system_name)
    # [START get_directory_client_from_service_client]
    directory_client = datalake_service_client.get_directory_client(
        file_system_client.file_system_name, "mydirectory")
    # [END get_directory_client_from_service_client]
    # [START get_file_client_from_service_client]
    file_client = datalake_service_client.get_file_client(
        file_system_client.file_system_name, "myfile")
    # [END get_file_client_from_service_client]

    # Create file and set properties
    metadata = {'hello': 'world', 'number': '42'}
    from azure.storage.filedatalake import ContentSettings
    content_settings = ContentSettings(
        content_language='spanish', content_disposition='inline')
    file_client.create_file(content_settings=content_settings)
    file_client.set_metadata(metadata=metadata)
    file_props = file_client.get_file_properties()
    print(file_props.metadata)

    # Create file/directory and set properties
    directory_client.create_directory(
        content_settings=content_settings, metadata=metadata)
    dir_props = directory_client.get_directory_properties()
    print(dir_props.metadata)

    # Delete File Systems
    # [START delete_file_system_from_service_client]
    datalake_service_client.delete_file_system("filesystem")
    # [END delete_file_system_from_service_client]
    file_system_client.delete_file_system()
# The sample scripts are provided AS IS without warranty of any kind. Microsoft
# further disclaims all implied warranties including, without limitation, any
# implied warranties of merchantability or of fitness for a particular purpose.
# The entire risk arising out of the use or performance of the sample scripts
# and documentation remains with you. In no event shall Microsoft, its authors,
# owners of this repository or anyone else involved in the creation,
# production, or delivery of the scripts be liable for any damages whatsoever
# (including, without limitation, damages for loss of business profits,
# business interruption, loss of business information, or other pecuniary
# loss) arising out of the use of or inability to use the sample scripts or
# documentation, even if Microsoft has been advised of the possibility of such
# damages
# -------------------------------------------------------------------------

# IMPORT THE LIBRARIES INTO YOUR FILE
from azure.storage.filedatalake import DataLakeServiceClient
from azure.storage.filedatalake._shared.base_client import create_configuration

# OPTION 1 - MAKING USE OF CONNECTION STRING AND CREATING THE DATALAKE CLIENT
connection_string = "PUT CONNECTION STRING HERE"

# CREATE THE DATALAKE SERVICE CLIENT
service_client = DataLakeServiceClient.from_connection_string(connection_string)

## OPTION 2 - MAKING USE OF ACCESS KEY AND CREATING THE DATALAKE CLIENT
storage_account_key = "ACCESS KEY"
storage_account_name = "ACCOUNT NAME"

# CREATE THE DATALAKE SERVICE CLIENT
# NOTE(review): this rebinds service_client, so the OPTION 1 client above is
# discarded — comment out whichever option you are not using.
service_client = DataLakeServiceClient(
    account_url="{}://{}.dfs.core.windows.net".format(
        "https", storage_account_name),
    credential=storage_account_key)

# PERFORM THE LISTING OPERATION
file_systems = service_client.list_file_systems()
for file_system in file_systems:
    print(file_system.name)
def __init__(self, *args, **kwargs):
    """Initialize the base class and cache a DataLakeServiceClient."""
    super().__init__(*args, **kwargs)
    self.datalake_service_client = (
        DataLakeServiceClient.from_connection_string(self.connection_string))