def __get_paths_since_last_run(self, filesystem_client: FileSystemClient,
                               max_files: int) -> List[PathProperties]:
    state = self.__retrieve_transformation_state(filesystem_client)
    last_successful_run = self.__get_datetime_from_string(state['last_successful_run'])

    if 'last_backfill_start' in state:
        self.last_backfill_start = self.__get_datetime_from_string(state['last_backfill_start'])

    now = datetime.utcnow()
    time_range = pd.date_range(last_successful_run, now, freq='H')

    paths = []
    for timeslot in time_range:
        folder_path = f'{self.guid}/year={timeslot.year}/month={timeslot.month:02d}/day={timeslot.day:02d}' + \
                      f'/hour={timeslot.hour:02d}'
        paths.extend(self.__get_file_paths_from_folder(folder_path, filesystem_client))

        if len(paths) > max_files:
            break

    paths_return = paths[:max_files]

    for path in paths_return:
        # Set 'processing' tag when the file is getting processed
        metadata = {'processing': datetime.utcnow().strftime(self.DATE_FORMAT)}
        filesystem_client.get_file_client(path.name).set_metadata(metadata)

    return paths_return
def __get_file_paths_from_folder(self, folder_path: str,
                                 file_system_client: FileSystemClient) -> List[PathProperties]:
    try:
        paths = list(file_system_client.get_paths(path=folder_path))
        unprocessed_files = []

        for path in paths:
            # 0-byte files can be ignored: if a 0-byte file exists, the ingress-api will send an
            # error code back to the adapter, which has the responsibility of ingesting the data again.
            if path.content_length == 0:
                message = f'0-byte file skipped: {path.name}'
                logger.warning(message)
                continue

            processed_file_metadata = file_system_client.get_file_client(
                path.name).get_file_properties().metadata

            # Check whether the data should be processed again
            if 'processed' not in processed_file_metadata:
                unprocessed_files.append(path)
            elif self.last_backfill_start > self.__get_datetime_from_string(
                    processed_file_metadata['processed']):
                unprocessed_files.append(path)

        return unprocessed_files
    except ResourceNotFoundError:
        return []
def test_file_sas_only_applies_to_file_level(self, datalake_storage_account_name,
                                             datalake_storage_account_key):
    self._setUp(datalake_storage_account_name, datalake_storage_account_key)
    # SAS URL is calculated from storage key, so this test runs live only
    file_name = self._get_file_reference()
    directory_name = self._get_directory_reference()
    self._create_file_and_return_client(directory=directory_name, file=file_name)

    # generate a token with file level read and write permissions
    token = generate_file_sas(
        self.dsc.account_name,
        self.file_system_name,
        directory_name,
        file_name,
        self.dsc.credential.account_key,
        permission=FileSasPermissions(read=True, write=True),
        expiry=datetime.utcnow() + timedelta(hours=1),
    )

    # read the created file which is under root directory
    file_client = DataLakeFileClient(self.dsc.url,
                                     self.file_system_name,
                                     directory_name + '/' + file_name,
                                     credential=token)
    properties = file_client.get_file_properties()

    # make sure we can read the file properties
    self.assertIsNotNone(properties)

    # try to write to the created file with the token
    response = file_client.append_data(b"abcd", 0, 4, validate_content=True)
    self.assertIsNotNone(response)

    # the token is for file level, so users are not supposed to have access to file system level operations
    file_system_client = FileSystemClient(self.dsc.url,
                                          self.file_system_name,
                                          credential=token)
    with self.assertRaises(ClientAuthenticationError):
        file_system_client.get_file_system_properties()

    # the token is for file level, so users are not supposed to have access to directory level operations
    directory_client = DataLakeDirectoryClient(self.dsc.url,
                                               self.file_system_name,
                                               directory_name,
                                               credential=token)
    with self.assertRaises(ClientAuthenticationError):
        directory_client.get_directory_properties()
def create_file_from_file_system(self):
    # [START create_file_system_client_from_connection_string]
    from azure.storage.filedatalake import FileSystemClient
    file_system_client = FileSystemClient.from_connection_string(
        self.connection_string, "filesystem")
    # [END create_file_system_client_from_connection_string]
    file_system_client.create_file_system()

    # [START create_directory_from_file_system]
    directory_client = file_system_client.create_directory("mydirectory")
    # [END create_directory_from_file_system]

    # [START create_file_from_file_system]
    file_client = file_system_client.create_file("myfile")
    # [END create_file_from_file_system]

    # [START delete_file_from_file_system]
    file_system_client.delete_file("myfile")
    # [END delete_file_from_file_system]

    # [START delete_directory_from_file_system]
    file_system_client.delete_directory("mydirectory")
    # [END delete_directory_from_file_system]

    file_system_client.delete_file_system()
def save_to_adls(connection_string, file_system, directory_path, file_name, data):
    file_system_client = FileSystemClient.from_connection_string(
        connection_string, file_system_name=file_system)
    directory_client = file_system_client.get_directory_client(directory_path)
    file_client = directory_client.create_file(file_name)
    file_client.append_data(data, 0, len(data))
    file_client.flush_data(len(data))
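# A minimal, hypothetical call to save_to_adls above. The connection string, file system name
# and paths are placeholders for illustration, not values taken from the original snippet.
sample_payload = b'{"sensor": "temp-01", "value": 21.5}'
save_to_adls(
    connection_string="DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;EndpointSuffix=core.windows.net",
    file_system="landing",
    directory_path="ingest-data/2021/01",
    file_name="reading.json",
    data=sample_payload)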
def __retrieve_transformation_state(self, filesystem_client: FileSystemClient) -> Dict:
    with filesystem_client.get_file_client(f'{self.guid}/{self.STATE_FILE}') as file_client:
        try:
            state = file_client.download_file().readall()
            return json.loads(state)
        except ResourceNotFoundError:
            return {'last_successful_run': '2018-01-01T00:00:00Z'}
def __get_file_paths(self, max_files: int,
                     local_client_auth: ClientAuthorization) -> List[PathProperties]:
    with FileSystemClient(self.account_url, self.filesystem_name,
                          credential=local_client_auth.get_credential_sync()) as filesystem_client:
        return self.__get_paths_since_last_run(filesystem_client, max_files)
def get_conn(self) -> FileSystemClient:
    """
    Return an Azure Data Lake Storage Gen2 FileSystemClient object.

    :return: FileSystemClient
    """
    conn = self.get_connection(self.conn_id)
    file_system_client = FileSystemClient(
        account_url=f"https://{conn.login}.dfs.core.windows.net",
        file_system_name=self.container,
        credential=conn.password,
    )
    return file_system_client
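# Hypothetical usage of the hook-style get_conn above. The hook class name, constructor arguments
# and connection id are assumptions (the original snippet only shows the method); conn.login is
# expected to hold the storage account name and conn.password the account key.
hook = AzureDataLakeGen2Hook(conn_id="adls_default", container="landing")  # assumed class/args
fs_client = hook.get_conn()
for path in fs_client.get_paths(path="ingest-data"):
    print(path.name, path.content_length)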
def batch_delete_files_or_empty_directories(self):
    from azure.storage.filedatalake import FileSystemClient
    file_system_client = FileSystemClient.from_connection_string(
        self.connection_string, "filesystem")

    file_system_client.create_file_system()

    data = b'hello world'

    try:
        # create file1
        file_system_client.get_file_client('file1').upload_data(data, overwrite=True)

        # create file2, then pass its file properties to the batch delete later
        file2 = file_system_client.get_file_client('file2')
        file2.upload_data(data, overwrite=True)
        file2_properties = file2.get_file_properties()

        # create file3 and batch delete it later only if the etag matches file3's etag
        file3 = file_system_client.get_file_client('file3')
        file3.upload_data(data, overwrite=True)
        file3_etag = file3.get_file_properties().etag

        # create dir1. Empty directories can be deleted using delete_files
        file_system_client.get_directory_client('dir1').create_directory()

        # create dir2, then pass its directory properties to the batch delete later
        dir2 = file_system_client.get_directory_client('dir2')
        dir2.create_directory()
        dir2_properties = dir2.get_directory_properties()
    except:
        pass

    # Act
    response = file_system_client.delete_files(
        'file1',
        file2_properties,
        {'name': 'file3', 'etag': file3_etag},
        'dir1',
        dir2_properties,
        raise_on_any_failure=False
    )
    print("total number of sub-responses: " + str(len(response)))
    print(response[0].status_code)
    print(response[2].status_code)
    print(response[3].status_code)
def save_state(self):
    """
    Updates the transformation state file after a successful run. It is important that this
    method gets called after the pipeline has run, or else the datasource will keep
    processing already processed files.
    """
    local_client_auth = self.client_auth.get_local_copy()

    with FileSystemClient(self.account_url, self.filesystem_name,
                          credential=local_client_auth.get_credential_sync()) as filesystem_client:
        state = self.__retrieve_transformation_state(filesystem_client)

        # The state file doesn't exist, so we create a fresh one.
        if not state:
            state = {}

        if len(self.file_paths) > 0:
            for path in self.file_paths:
                # Set 'processed' tag in the metadata of the file
                metadata = {'processed': datetime.utcnow().strftime(self.DATE_FORMAT)}
                filesystem_client.get_file_client(path.name).set_metadata(metadata)

            # Get the date from the folder structure of the last file that was processed
            date_elements = self.file_paths[-1].name.split('/')[1:-1]
            date_str = ''.join([x.split('=')[1] for x in date_elements])
            latest_folder_date = datetime.strptime(date_str, '%Y%m%d%H').strftime(self.DATE_FORMAT)

            state['last_successful_run'] = latest_folder_date
            state['last_backfill_start'] = self.last_backfill_start.strftime(self.DATE_FORMAT)

            self.__save_transformation_state(filesystem_client, state)

        self.file_paths = []
class AzureDataLakeGen2(object):

    def __init__(self):
        """
        Constructor for the Azure Data Lake Gen2 class, which instantiates a connection to an
        Azure storage account with Data Lake Gen2 mounted as the container.

        Arguments:
            dbutils {object} -- Represents the Databricks utilities library for PySpark
            client {object} -- Represents the Azure Key Vault client used to retrieve secrets
        """
        self.account_name = dbutils.secrets.get(scope='databricks-secret-scope', key='datalake-account')
        self.account_key = dbutils.secrets.get(scope='databricks-secret-scope', key='datalake-key')
        self.account_url = "https://{0}.dfs.core.windows.net/".format(self.account_name)
        self.service_client = DataLakeServiceClient(account_url=self.account_url,
                                                    credential=self.account_key)
        self.file_system_name = 'datalake'
        self.file_system = FileSystemClient(account_url=self.account_url,
                                            file_system_name=self.file_system_name,
                                            credential=self.account_key)

    def get_directory_list(self, directory: str):
        """
        Lists all of the file contents of an Azure Data Lake Gen2 directory based on the path
        prefix. Sub-directory contents are included, since get_paths recurses by default.

        Arguments:
            directory {str} -- Directory name to use in order to retrieve files

        Returns:
            file_list -- File list containing all of the paths for each file within the directory
        """
        try:
            logger.info('Listing the files from the given directory: {0}'.format(directory))
            file_list = self.file_system.get_paths(path=directory)
            if file_list is not None:
                return file_list
        except Exception as e:
            logger.error(str(e))
            raise e
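# Hypothetical usage of AzureDataLakeGen2 above, assuming it runs inside a Databricks notebook
# where `dbutils` and the referenced secret scope are available; the directory name is a placeholder.
adls = AzureDataLakeGen2()
for path in adls.get_directory_list('raw/sales'):
    print(path.name, path.last_modified)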
import csv
import random
from configparser import ConfigParser
from datetime import datetime, timedelta, timezone
from io import StringIO
from typing import List

from azure.storage.filedatalake import DataLakeFileClient, FileSystemClient

num_hours: int = 100

config: ConfigParser = ConfigParser()
config.read("config.ini")

start_date: datetime = datetime(2019, 10, 1, 0, 0, 0, 0, tzinfo=timezone.utc)
assets: List[int] = [x for x in range(15820, 15960)]
connection_string: str = config.get("default", "connection_string")
remote_path: str = "ingest-data"

client: FileSystemClient
with FileSystemClient.from_connection_string(connection_string, "landing") as client:
    for hour_offset in range(num_hours):
        file_date: datetime = start_date + timedelta(hours=hour_offset)
        output_filename: str = f"events-{file_date.strftime('%Y%m%d_%H%M%S')}.csv"

        content: StringIO
        with StringIO() as content:
            csv_writer: csv.writer = csv.writer(content)
            csv_writer.writerow(["asset", "ts", "value", "comments"])

            data: List = []
            for i in range(3600):
                asset: int = random.choice(assets)
                value: float = random.random()
                event_date: datetime = file_date + timedelta(seconds=i)
                comment: str = f"This is a random comment for asset: {asset}" if i % 13 == 0 else ""
                data.append([asset, int(event_date.timestamp()), value, comment])
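            # --- Assumed continuation: the original snippet ends after building `data` ---
            # A sketch of how the generated rows might be written to the CSV buffer and uploaded,
            # reusing `csv_writer`, `content`, `client`, `remote_path` and `output_filename` from above.
            csv_writer.writerows(data)
            file_client: DataLakeFileClient = client.get_file_client(f"{remote_path}/{output_filename}")
            file_client.upload_data(content.getvalue().encode("utf-8"), overwrite=True)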
            # at this point we don't handle the exceptions; if there is any issue we will just log that information
            print(f"time: {datetime.now()}, radio: {radio_station['name']}, day: {current_day.strftime('%d-%m-%Y')}, hour: {hour['hour_from']} - {hour['hour_to']} - error")

        # report success on radio and day level
        print(f"time: {datetime.now()}, radio: {radio_station['name']}, day: {current_day.strftime('%d-%m-%Y')} - done")

        # save data on a yearly basis - after parsing the first day of each year,
        # or if this is the last day of the defined range; n == number of days
        if n == number_of_days or (current_day.day == 1 and current_day.month == 1):
            # convert playlist to json
            data = json.dumps(playlist, indent=4, sort_keys=True, default=str)

            # specify file name and file path
            directory_path = f"{sa_parent_directory_name}/{radio_station['name'].replace(' ', '').strip()}"
            current_file_name = f"{str(current_day.year)}.json"

            # connect to Azure Data Lake Storage
            file_system_client = FileSystemClient.from_connection_string(sa_connection_string, file_system_name=sa_file_system)
            directory_client = file_system_client.get_directory_client(directory_path)

            # save data to file
            file_client = directory_client.create_file(current_file_name)
            file_client.append_data(data, 0, len(data))
            file_client.flush_data(len(data))

            # once the current data are saved, clear the list
            playlist = []

# report script success
print("done")
def __get_filesystem_client(token: str) -> FileSystemClient:
    account_url = config['Azure Storage']['account_url']
    filesystem_name = config['Azure Storage']['filesystem_name']
    credential = AzureCredential(token)

    return FileSystemClient(account_url, filesystem_name, credential=credential)
def __save_transformation_state(self, filesystem_client: FileSystemClient, state: Dict):
    with filesystem_client.get_file_client(f'{self.guid}/{self.STATE_FILE}') as file_client:
        json_data = json.dumps(state)
        file_client.upload_data(json_data, overwrite=True)
def _get_container_client(self, storage_account_url: str, file_system: str,
                          credential: Union[DefaultAzureCredential, str]):
    # Use a distinct name so the file_system parameter is not shadowed
    file_system_client = FileSystemClient(account_url=storage_account_url,
                                          file_system_name=file_system,
                                          credential=credential)
    return file_system_client
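# Hypothetical call to _get_container_client above. `loader` stands in for an instance of
# whatever class defines the method (the original snippet does not show it); the account URL
# and file system name are placeholders. DefaultAzureCredential comes from azure.identity.
from azure.identity import DefaultAzureCredential

fs_client = loader._get_container_client(
    storage_account_url="https://mystorageaccount.dfs.core.windows.net",
    file_system="raw",
    credential=DefaultAzureCredential())
for path in fs_client.get_paths():
    print(path.name)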