    def __get_paths_since_last_run(self, filesystem_client: FileSystemClient,
                                   max_files: int) -> List[PathProperties]:
        state = self.__retrieve_transformation_state(filesystem_client)

        last_successful_run = self.__get_datetime_from_string(
            state['last_successful_run'])
        if 'last_backfill_start' in state:
            self.last_backfill_start = self.__get_datetime_from_string(
                state['last_backfill_start'])

        now = datetime.utcnow()
        time_range = pd.date_range(last_successful_run, now, freq='H')
        paths = []

        for timeslot in time_range:
            folder_path = f'{self.guid}/year={timeslot.year}/month={timeslot.month:02d}/day={timeslot.day:02d}' + \
                          f'/hour={timeslot.hour:02d}'

            paths.extend(
                self.__get_file_paths_from_folder(folder_path,
                                                  filesystem_client))

            if len(paths) > max_files:
                break

        paths_return = paths[:max_files]

        for path in paths_return:
            # Set 'processing' tag when the file is getting processed
            metadata = {
                'processing': datetime.utcnow().strftime(self.DATE_FORMAT)
            }
            filesystem_client.get_file_client(path.name).set_metadata(metadata)

        return paths_return
    def __get_file_paths_from_folder(
            self, folder_path: str,
            file_system_client: FileSystemClient) -> List[PathProperties]:
        try:
            paths = list(file_system_client.get_paths(path=folder_path))
            unprocessed_files = []

            for path in paths:
                # Skip 0-byte files: they can safely be ignored.
                # If a 0-byte file exists, the ingress-api will send an error code back to the adapter,
                # which is responsible for ingesting the data again.
                if path.content_length == 0:
                    message = f'0-byte file skipped: {path.name}'
                    logger.warning(message)
                    continue

                processed_file_metadata = file_system_client.get_file_client(
                    path.name).get_file_properties().metadata

                # We check if data should be processed again
                if 'processed' not in processed_file_metadata:
                    unprocessed_files.append(path)
                elif self.last_backfill_start > self.__get_datetime_from_string(
                        processed_file_metadata['processed']):
                    unprocessed_files.append(path)

            return unprocessed_files
        except ResourceNotFoundError:
            return []
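The reprocessing logic above hinges on per-file metadata ('processing' and 'processed' timestamps). Below is a minimal round-trip sketch of that mechanism; the connection string, filesystem name and file path are placeholders, not values from the example above.

from datetime import datetime

from azure.storage.filedatalake import FileSystemClient

CONN_STR = "<storage connection string>"  # placeholder, not a real credential

fs_client = FileSystemClient.from_connection_string(CONN_STR, file_system_name="landing")
file_client = fs_client.get_file_client("myguid/year=2021/month=03/day=05/hour=07/data.json")

# tag the file the same way the transformation does, then read the tag back
file_client.set_metadata({"processed": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")})
metadata = file_client.get_file_properties().metadata
print(metadata.get("processed"))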
Example #3
    def test_file_sas_only_applies_to_file_level(self,
                                                 datalake_storage_account_name,
                                                 datalake_storage_account_key):
        self._setUp(datalake_storage_account_name,
                    datalake_storage_account_key)
        # SAS URL is calculated from storage key, so this test runs live only
        file_name = self._get_file_reference()
        directory_name = self._get_directory_reference()
        self._create_file_and_return_client(directory=directory_name,
                                            file=file_name)

        # generate a token with file level read and write permissions
        token = generate_file_sas(
            self.dsc.account_name,
            self.file_system_name,
            directory_name,
            file_name,
            self.dsc.credential.account_key,
            permission=FileSasPermissions(read=True, write=True),
            expiry=datetime.utcnow() + timedelta(hours=1),
        )

        # read the created file which is under root directory
        file_client = DataLakeFileClient(self.dsc.url,
                                         self.file_system_name,
                                         directory_name + '/' + file_name,
                                         credential=token)
        properties = file_client.get_file_properties()

        # make sure we can read the file properties
        self.assertIsNotNone(properties)

        # try to write to the created file with the token
        response = file_client.append_data(b"abcd",
                                           0,
                                           4,
                                           validate_content=True)
        self.assertIsNotNone(response)

        # the token is for file level, so users are not supposed to have access to file system level operations
        file_system_client = FileSystemClient(self.dsc.url,
                                              self.file_system_name,
                                              credential=token)
        with self.assertRaises(ClientAuthenticationError):
            file_system_client.get_file_system_properties()

        # the token is for file level, so users are not supposed to have access to directory level operations
        directory_client = DataLakeDirectoryClient(self.dsc.url,
                                                   self.file_system_name,
                                                   directory_name,
                                                   credential=token)
        with self.assertRaises(ClientAuthenticationError):
            directory_client.get_directory_properties()
Example #4
    def __init__(self):
        """
        [Constructor for Azure Data Lake Gen2 class, which instantiates a connection to an
        Azure storage account with Data Lake Gen2 mounted as the container]

        Arguments:
            dbutils {object} -- [Represents databricks utilities library for pyspark]
            client {object} -- [Represents the azure key vault client used to retrieve secrets]
        """
           
        self.account_name = dbutils.secrets.get(scope='databricks-secret-scope',key='datalake-account')
        self.account_key = dbutils.secrets.get(scope='databricks-secret-scope',key='datalake-key')
        self.account_url = "https://{0}.dfs.core.windows.net/".format(self.account_name)
        self.service_client = DataLakeServiceClient(account_url=self.account_url, credential=self.account_key)
        self.file_system_name = 'datalake'
        self.file_system = FileSystemClient(account_url=self.account_url,file_system_name=self.file_system_name, credential=self.account_key)
    def create_file_from_file_system(self):
        # [START create_file_system_client_from_connection_string]
        from azure.storage.filedatalake import FileSystemClient
        file_system_client = FileSystemClient.from_connection_string(
            self.connection_string, "filesystem")
        # [END create_file_system_client_from_connection_string]

        file_system_client.create_file_system()

        # [START create_directory_from_file_system]
        directory_client = file_system_client.create_directory("mydirectory")
        # [END create_directory_from_file_system]

        # [START create_file_from_file_system]
        file_client = file_system_client.create_file("myfile")
        # [END create_file_from_file_system]

        # [START delete_file_from_file_system]
        file_system_client.delete_file("myfile")
        # [END delete_file_from_file_system]

        # [START delete_directory_from_file_system]
        file_system_client.delete_directory("mydirectory")
        # [END delete_directory_from_file_system]

        file_system_client.delete_file_system()
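The sample above only creates and deletes resources. Here is a small hedged sketch of writing and reading content with the same client types; the connection string is a placeholder.

from azure.storage.filedatalake import FileSystemClient

connection_string = "<storage connection string>"  # placeholder

file_system_client = FileSystemClient.from_connection_string(connection_string, "filesystem")
file_client = file_system_client.create_file("myfile")

# upload a small payload, then read it back
file_client.upload_data(b"hello datalake", overwrite=True)
downloaded = file_client.download_file().readall()
print(downloaded)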
Example #6
def save_to_adls(connection_string, file_system, directory_path, file_name,
                 data):
    file_system_client = FileSystemClient.from_connection_string(
        connection_string, file_system_name=file_system)
    directory_client = file_system_client.get_directory_client(directory_path)
    file_client = directory_client.create_file(file_name)
    file_client.append_data(data, 0, len(data))
    file_client.flush_data(len(data))
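A hypothetical invocation of save_to_adls; the connection string, filesystem name and paths below are placeholders chosen for illustration.

save_to_adls(
    connection_string="<storage connection string>",
    file_system="landing",
    directory_path="exports/2021/03",
    file_name="report.json",
    data=b'{"status": "ok"}',
)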
    def __retrieve_transformation_state(
            self, filesystem_client: FileSystemClient) -> Dict:
        with filesystem_client.get_file_client(
                f'{self.guid}/{self.STATE_FILE}') as file_client:
            try:
                state = file_client.download_file().readall()
                return json.loads(state)
            except ResourceNotFoundError:
                return {'last_successful_run': '2018-01-01T00:00:00Z'}
    def __get_file_paths(
            self, max_files: int,
            local_client_auth: ClientAuthorization) -> List[PathProperties]:
        with FileSystemClient(self.account_url,
                              self.filesystem_name,
                              credential=local_client_auth.get_credential_sync(
                              )) as filesystem_client:

            return self.__get_paths_since_last_run(filesystem_client,
                                                   max_files)
    def get_conn(self) -> FileSystemClient:
        """
        Return an Azure Data Lake Storage FileSystemClient for the configured container.

        :return: FileSystemClient
        """
        conn = self.get_connection(self.conn_id)
        file_system_client = FileSystemClient(
            account_url=f"https://{conn.login}.dfs.core.windows.net",
            file_system_name=self.container,
            credential=conn.password,
        )
        return file_system_client
    def batch_delete_files_or_empty_directories(self):
        from azure.storage.filedatalake import FileSystemClient
        file_system_client = FileSystemClient.from_connection_string(
            self.connection_string, "filesystem")

        file_system_client.create_file_system()

        data = b'hello world'

        try:
            # create file1
            file_system_client.get_file_client('file1').upload_data(
                data, overwrite=True)

            # create file2, then pass file properties in batch delete later
            file2 = file_system_client.get_file_client('file2')
            file2.upload_data(data, overwrite=True)
            file2_properties = file2.get_file_properties()

            # create file3; it is batch-deleted later only if its etag matches file3_etag
            file3 = file_system_client.get_file_client('file3')
            file3.upload_data(data, overwrite=True)
            file3_etag = file3.get_file_properties().etag

            # create dir1. Empty directory can be deleted using delete_files
            file_system_client.get_directory_client('dir1').create_directory()

            # create dir2, then pass directory properties in batch delete later
            dir2 = file_system_client.get_directory_client('dir2')
            dir2.create_directory()
            dir2_properties = dir2.get_directory_properties()
        except:
            pass

        # Act
        response = file_system_client.delete_files('file1',
                                                   file2_properties, {
                                                       'name': 'file3',
                                                       'etag': file3_etag
                                                   },
                                                   'dir1',
                                                   dir2_properties,
                                                   raise_on_any_failure=False)
        print("total number of sub-responses:" + len(response))
        print(response[0].status_code)
        print(response[2].status_code)
        print(response[3].status_code)
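A small follow-up sketch: because raise_on_any_failure=False is passed, each sub-request's outcome can be inspected individually. Iterating the returned responses avoids hard-coding indices; this assumes the `response` object from the call above.

# assumes `response` from the delete_files call above
for index, sub_response in enumerate(response):
    print(f"sub-request {index}: HTTP {sub_response.status_code}")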
    def save_state(self):
        """
        Updates the transformation state file after a successful run. It is important that this method gets
        called after the pipeline has run, otherwise the datasource will keep processing already processed files.
        """
        local_client_auth = self.client_auth.get_local_copy()

        with FileSystemClient(self.account_url,
                              self.filesystem_name,
                              credential=local_client_auth.get_credential_sync(
                              )) as filesystem_client:
            state = self.__retrieve_transformation_state(filesystem_client)

            # state file doesn't exist. We create a fresh one.
            if not state:
                state = {}

            if len(self.file_paths) > 0:
                for path in self.file_paths:
                    # Set 'processed' tag in the metadata of the file
                    metadata = {
                        'processed':
                        datetime.utcnow().strftime(self.DATE_FORMAT)
                    }
                    filesystem_client.get_file_client(
                        path.name).set_metadata(metadata)

                # Get the date from the folder structure of the last file it has processed
                date_elements = self.file_paths[-1].name.split('/')[1:-1]
                date_str = ''.join([x.split('=')[1] for x in date_elements])

                latest_folder_date = datetime.strptime(
                    date_str, '%Y%m%d%H').strftime(self.DATE_FORMAT)
                state['last_successful_run'] = latest_folder_date
                state[
                    'last_backfill_start'] = self.last_backfill_start.strftime(
                        self.DATE_FORMAT)
                self.__save_transformation_state(filesystem_client, state)

        self.file_paths = []
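The folder-date parsing in save_state can be hard to follow at a glance. Here is a self-contained walk-through using an illustrative path shaped like the ones the transformation processes ('<guid>/year=.../month=.../day=.../hour=.../<file>').

from datetime import datetime

name = "3fa85f64-5717-4562-b3fc-2c963f66afa6/year=2021/month=03/day=05/hour=07/data.json"

date_elements = name.split('/')[1:-1]  # ['year=2021', 'month=03', 'day=05', 'hour=07']
date_str = ''.join(x.split('=')[1] for x in date_elements)  # '2021030507'
print(datetime.strptime(date_str, '%Y%m%d%H'))  # 2021-03-05 07:00:00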
Example #12
class AzureDataLakeGen2(object):
    
    def __init__(self):
        """
        [Constructor for Azure Data Lake Gen2 class, which instantiates a connection to an
        Azure storage account with Data Lake Gen2 mounted as the container]

        Arguments:
            dbutils {object} -- [Represents databricks utilities library for pyspark]
            client {object} -- [Represents the azure key vault client used to retrieve secrets]
        """
           
        self.account_name = dbutils.secrets.get(scope='databricks-secret-scope',key='datalake-account')
        self.account_key = dbutils.secrets.get(scope='databricks-secret-scope',key='datalake-key')
        self.account_url = "https://{0}.dfs.core.windows.net/".format(self.account_name)
        self.service_client = DataLakeServiceClient(account_url=self.account_url, credential=self.account_key)
        self.file_system_name = 'datalake'
        self.file_system = FileSystemClient(account_url=self.account_url,file_system_name=self.file_system_name, credential=self.account_key)
        
    def get_directory_list(self, directory: str):
        """
		[Lists all of the file contents of a Azure Data Lake Gen2 directory based on the path prefix. 
        Recursive is set when file contents of sub-directories are required.]

          Arguments:
              directory {str} -- [Directory name to use in order to retrieve files]
              recursive {str} -- [Option set to determine if all sub-directory information is also gathered]

          Returns:
              file_list -- [File list containing all of the paths for each file within the directory]
        """        
        try:
			logger.info('Listing the files from the given directory: {0}'.format(directory))
			file_list = self.file_system.get_paths(path=directory)
			if file_list is not None:
				return file_list
		except Exception as e:
			logger.error(str(e))
			raise(e)
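A hypothetical usage of the class above; dbutils is only available inside a Databricks runtime, and the directory path here is purely illustrative.

adls = AzureDataLakeGen2()
for path in adls.get_directory_list("raw/sales/2021"):
    print(path.name, path.content_length)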
import csv
import random
from configparser import ConfigParser
from datetime import datetime, timedelta, timezone
from io import StringIO
from typing import List

from azure.storage.filedatalake import DataLakeFileClient, FileSystemClient

num_hours: int = 100

config: ConfigParser = ConfigParser()
config.read("config.ini")

start_date: datetime = datetime(2019, 10, 1, 0, 0, 0, 0, tzinfo=timezone.utc)
assets: List[int] = [x for x in range (15820, 15960)]

connection_string: str = config.get("default", "connection_string")
remote_path: str = "ingest-data"

client: FileSystemClient
with FileSystemClient.from_connection_string(connection_string, "landing") as client:
    for hour_offset in range(num_hours):
        file_date: datetime = start_date + timedelta(hours=hour_offset)
        output_filename: str = f"events-{file_date.strftime('%Y%m%d_%H%M%S')}.csv"

        content: StringIO
        with StringIO() as content:
            csv_writer: csv.writer = csv.writer(content)
            csv_writer.writerow(["asset", "ts", "value", "comments"])
            data: List = []
            for i in range(3600):
                asset: int = random.choice(assets)
                value: float = random.random()
                event_date: datetime = file_date + timedelta(seconds=i)
                comment: str = f"This is a random comment for asset: {asset}" if i % 13 == 0 else ""
                data.append([asset, int(event_date.timestamp()), value, comment])
            # write the generated rows and upload the hourly CSV to the landing filesystem
            csv_writer.writerows(data)
            file_client = client.get_file_client(f"{remote_path}/{output_filename}")
            file_client.upload_data(content.getvalue(), overwrite=True)

    # report success on radio and day level
    print (f"time: {datetime.now()}, radio: {radio_station['name']}, day: {current_day.strftime('%d-%m-%Y')} - done")

    # save data on a yearly basis - after parsing the first day of each year,
    # or if it is the last day of the defined range; n == number of days
    if n == number_of_days or (current_day.day == 1 and current_day.month == 1):

        # convert playlist to json
        data = json.dumps(playlist, indent=4, sort_keys=True, default=str)

        # specify file name and file path
        directory_path = f"{sa_parent_directory_name}/{radio_station['name'].replace(' ', '').strip()}" 
        current_file_name = f"{str(current_day.year)}.json"

        # connect to Azure data lake storage
        file_system_client = FileSystemClient.from_connection_string(sa_connection_string, file_system_name=sa_file_system)
        directory_client = file_system_client.get_directory_client(directory_path)

        # save data to file
        file_client = directory_client.create_file(current_file_name)
        file_client.append_data(data, 0, len(data))
        file_client.flush_data(len(data))

        # once the current data are saved, clear the list
        playlist = []

# report script success
print ("done")
def __get_filesystem_client(token: str) -> FileSystemClient:
    account_url = config['Azure Storage']['account_url']
    filesystem_name = config['Azure Storage']['filesystem_name']
    credential = AzureCredential(token)

    return FileSystemClient(account_url, filesystem_name, credential=credential)
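AzureCredential here appears to be a project-specific token wrapper. For reference, a minimal sketch of building the same client with the standard azure-identity package instead; the account URL and filesystem name are placeholders.

from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import FileSystemClient


def get_filesystem_client() -> FileSystemClient:
    account_url = "https://<account>.dfs.core.windows.net"  # placeholder
    filesystem_name = "datalake"  # placeholder
    return FileSystemClient(account_url, filesystem_name,
                            credential=DefaultAzureCredential())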
    def __save_transformation_state(self, filesystem_client: FileSystemClient,
                                    state: Dict):
        with filesystem_client.get_file_client(
                f'{self.guid}/{self.STATE_FILE}') as file_client:
            json_data = json.dumps(state)
            file_client.upload_data(json_data, overwrite=True)
Example #17
    def _get_container_client(self, storage_account_url: str, file_system: str,
                              credential: Union[DefaultAzureCredential, str]):
        file_system_client = FileSystemClient(account_url=storage_account_url,
                                              file_system_name=file_system,
                                              credential=credential)
        return file_system_client