def glob(self, path):
    """Expand a template path into fully-qualified ``adl://`` URLs.

    The path is first stripped down to its store-relative part, then each
    match returned by the underlying ADL glob is prefixed with the
    account's ``adl://<store>.azuredatalakestore.net/`` root.
    """
    store_relative = self._trim_filename(path)
    prefix = 'adl://%s.azuredatalakestore.net/' % self.store_name
    matches = AzureDLFileSystem.glob(self, store_relative)
    return [prefix + match for match in matches]
class AzureDatalakeFileSystem(AbstractFileSystem):
    """
    Access Azure Datalake Gen1 as if it were a file system.

    This exposes a filesystem-like API on top of Azure Datalake Storage.

    Parameters
    ----------
    tenant_id: string
        Azure tenant, also known as the subscription id
    client_id: string
        The username or service principal id
    client_secret: string
        The access key
    store_name: string (optional)
        The name of the datalake account being accessed.  Should be inferred
        from the urlpath if using with Dask read_xxx and to_xxx methods.

    Examples
    --------
    >>> adl = AzureDatalakeFileSystem(tenant_id="xxxx", client_id="xxxx",
    ...                               client_secret="xxxx")
    >>> adl.ls('')

    Sharded Parquet & CSV files can be read as

    >>> storage_options = dict(tenant_id=TENANT_ID, client_id=CLIENT_ID,
    ...                        client_secret=CLIENT_SECRET)  # doctest: +SKIP
    >>> ddf = dd.read_parquet('adl://store_name/folder/filename.parquet',
    ...                       storage_options=storage_options)  # doctest: +SKIP
    >>> ddf = dd.read_csv('adl://store_name/folder/*.csv',
    ...                   storage_options=storage_options)  # doctest: +SKIP

    Sharded Parquet and CSV files can be written as

    >>> ddf.to_parquet("adl://store_name/folder/filename.parquet",
    ...                storage_options=storage_options)  # doctest: +SKIP
    >>> ddf.to_csv('adl://store_name/folder/*.csv',
    ...            storage_options=storage_options)  # doctest: +SKIP
    """

    protocol = "adl"

    def __init__(self, tenant_id, client_id, client_secret, store_name):
        super().__init__()
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret
        self.store_name = store_name
        self.do_connect()

    @staticmethod
    def _get_kwargs_from_urls(paths):
        """Get the store_name from the urlpath and pass to storage_options."""
        ops = infer_storage_options(paths)
        out = {}
        if ops.get("host", None):
            out["store_name"] = ops["host"]
        return out

    @classmethod
    def _strip_protocol(cls, path):
        """Return the path component of an adl:// urlpath."""
        ops = infer_storage_options(path)
        return ops["path"]

    def do_connect(self):
        """Establish connection object."""
        token = lib.auth(
            tenant_id=self.tenant_id,
            client_id=self.client_id,
            client_secret=self.client_secret,
        )
        self.azure_fs = AzureDLFileSystem(token=token, store_name=self.store_name)

    def ls(self, path, detail=False, invalidate_cache=True, **kwargs):
        """List entries under ``path``.

        With ``detail=False`` the backend returns plain path strings; with
        ``detail=True`` it returns dicts, whose ADL ``"DIRECTORY"`` type is
        normalized to fsspec's lowercase ``"directory"``.
        """
        files = self.azure_fs.ls(
            path=path, detail=detail, invalidate_cache=invalidate_cache
        )
        # Only normalize when entries are dicts.  The original code iterated
        # unconditionally, so a detail=False listing whose path merely
        # contained the substring "type" raised TypeError on file["type"].
        if detail:
            for file in files:
                if "type" in file and file["type"] == "DIRECTORY":
                    file["type"] = "directory"
        return files

    def info(self, path, invalidate_cache=True, expected_error_code=404, **kwargs):
        """Return file details, adding the fsspec-standard ``size`` key."""
        info = self.azure_fs.info(
            path=path,
            invalidate_cache=invalidate_cache,
            expected_error_code=expected_error_code,
        )
        # fsspec callers expect "size"; the ADL backend reports "length".
        info["size"] = info["length"]
        return info

    def _trim_filename(self, fn, **kwargs):
        """Determine what kind of filestore this is and return the path."""
        so = infer_storage_options(fn)
        fileparts = so["path"]
        return fileparts

    def glob(self, path, details=False, invalidate_cache=True, **kwargs):
        """For a template path, return matching files."""
        adlpaths = self._trim_filename(path)
        filepaths = self.azure_fs.glob(
            adlpaths, details=details, invalidate_cache=invalidate_cache
        )
        return filepaths

    def isdir(self, path, **kwargs):
        """Is this entry directory-like?"""
        try:
            return self.info(path)["type"].lower() == "directory"
        except FileNotFoundError:
            return False

    def isfile(self, path, **kwargs):
        """Is this entry file-like?"""
        try:
            return self.azure_fs.info(path)["type"].lower() == "file"
        except Exception:
            # Best-effort probe: any backend failure is reported as "not a
            # file" rather than propagated, matching isdir's tolerant style.
            return False

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        # NOTE(review): block_size/autocommit/cache_options are accepted for
        # interface compatibility but not forwarded to AzureDatalakeFile.
        return AzureDatalakeFile(self, path, mode=mode)

    def read_block(self, fn, offset, length, delimiter=None, **kwargs):
        """Read ``length`` bytes of ``fn`` starting at ``offset``."""
        return self.azure_fs.read_block(fn, offset, length, delimiter)

    def ukey(self, path):
        """Unique identifier for the file's current version."""
        return tokenize(self.info(path)["modificationTime"])

    def size(self, path):
        """Size of the file at ``path`` in bytes."""
        return self.info(path)["length"]

    def __getstate__(self):
        dic = self.__dict__.copy()
        logger.debug("Serialize with state: %s", dic)
        return dic

    def __setstate__(self, state):
        logger.debug("De-serialize with state: %s", state)
        self.__dict__.update(state)
        # The live connection object cannot be pickled; rebuild it.
        self.do_connect()