def __get_file_paths_from_folder(
            self, folder_path: str,
            file_system_client: FileSystemClient) -> List[PathProperties]:
        try:
            paths = list(file_system_client.get_paths(path=folder_path))
            unprocessed_files = []

            for path in paths:
                # Skip 0-byte files: they can be ignored here.
                # - If a 0-byte file exists, the ingress-api sends an error code back to the adapter,
                #   which is responsible for ingesting the data again.
                if path.content_length == 0:
                    message = f'0-byte file skipped: {path.name}'
                    logger.warning(message)
                    continue

                processed_file_metadata = file_system_client.get_file_client(
                    path.name).get_file_properties().metadata

                # Reprocess the file if it has never been processed, or if it was processed before the latest backfill started
                if 'processed' not in processed_file_metadata:
                    unprocessed_files.append(path)
                elif self.last_backfill_start > self.__get_datetime_from_string(
                        processed_file_metadata['processed']):
                    unprocessed_files.append(path)

            return unprocessed_files
        except ResourceNotFoundError:
            return []
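
The timestamp comparison above relies on a private helper, __get_datetime_from_string, which is not shown in this snippet. Below is a minimal sketch of what such a helper could look like, assuming the 'processed' metadata value is stored as an ISO-8601 string (the standalone name and the timestamp format are assumptions, not confirmed by the source):

from datetime import datetime

def get_datetime_from_string(value: str) -> datetime:
    # Hypothetical stand-in for the private __get_datetime_from_string helper
    # used above; assumes the 'processed' metadata value is an ISO-8601
    # timestamp such as '2021-06-01T08:30:00+00:00'.
    return datetime.fromisoformat(value)
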
Example 2
import logging

from azure.storage.filedatalake import DataLakeServiceClient, FileSystemClient

logger = logging.getLogger(__name__)


class AzureDataLakeGen2(object):

    def __init__(self):
        """
        [Constructor for the Azure Data Lake Gen2 class, which establishes a connection to an
        Azure storage account with a Data Lake Gen2 file system used as the container.

        Takes no arguments; the storage account name and key are read via the Databricks
        dbutils utilities from the 'databricks-secret-scope' secret scope, which is
        typically backed by Azure Key Vault.]
        """
        self.account_name = dbutils.secrets.get(scope='databricks-secret-scope', key='datalake-account')
        self.account_key = dbutils.secrets.get(scope='databricks-secret-scope', key='datalake-key')
        self.account_url = "https://{0}.dfs.core.windows.net/".format(self.account_name)
        self.service_client = DataLakeServiceClient(account_url=self.account_url, credential=self.account_key)
        self.file_system_name = 'datalake'
        self.file_system = FileSystemClient(account_url=self.account_url, file_system_name=self.file_system_name, credential=self.account_key)

    def get_directory_list(self, directory: str, recursive: bool = True):
        """
        [Lists the file contents of an Azure Data Lake Gen2 directory based on the path prefix.
        Recursive is set when the file contents of sub-directories are also required.]

        Arguments:
            directory {str} -- [Directory name to use in order to retrieve files]
            recursive {bool} -- [Option set to determine if all sub-directory information is also gathered]

        Returns:
            file_list -- [Iterable of path properties for each file within the directory]
        """
        try:
            logger.info('Listing the files from the given directory: {0}'.format(directory))
            file_list = self.file_system.get_paths(path=directory, recursive=recursive)
            return file_list
        except Exception as e:
            logger.error(str(e))
            raise
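
For context, a brief usage sketch, assuming the class above runs inside a Databricks notebook where dbutils and the 'databricks-secret-scope' secrets are available; the directory name below is hypothetical:

# Usage sketch (Databricks notebook context assumed; 'raw/sales' is a
# hypothetical directory inside the 'datalake' file system).
adls = AzureDataLakeGen2()
for path in adls.get_directory_list('raw/sales'):
    print(path.name)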