class WritableTextFilesADLSource(base.DataSource):
    """An Azure Data Lake source that can also write local files to the lake.

    Parameters
    ----------
    tenant_id : str
        Azure Active Directory tenant id used for authentication.
    client_id : str
        Service principal / application id.
    client_secret : str
        Secret associated with ``client_id``.
    store_name : str
        Name of the Data Lake store to connect to.
    metadata : dict, optional
        Extra metadata forwarded to the intake base class.
    """

    name = 'writableadltext'
    partition_access = True
    version = '0.0.1dev'

    def __init__(self, tenant_id, client_id, client_secret, store_name,
                 metadata=None):
        token = lib.auth(tenant_id=tenant_id, client_id=client_id,
                         client_secret=client_secret)
        self.adl = AzureDLFileSystem(store_name=store_name, token=token)
        super(WritableTextFilesADLSource, self).__init__(metadata=metadata)

    def write(self, local_path, remote_path):
        """Upload the file at ``local_path`` to ``remote_path`` on the lake."""
        print('writing')
        self.adl.put(local_path, remote_path)

    def read(self, path):
        """Read the remote file at ``path`` and return its raw bytes.

        Bug fix: previously the data was only printed and then discarded
        (implicit ``None`` return); it is now returned so callers can use
        it.  The print is kept for backward-compatible console output.
        """
        with self.adl.open(path, 'rb') as f:
            data = f.read()
        print(data)
        return data
def do_connect(self):
    """Authenticate and create the underlying ``AzureDLFileSystem`` object."""
    credentials = lib.auth(
        tenant_id=self.tenant_id,
        client_id=self.client_id,
        client_secret=self.client_secret,
    )
    self.azure_fs = AzureDLFileSystem(
        store_name=self.store_name, token=credentials
    )
def second_azure():
    """Yield a fresh ``AzureDLFileSystem`` with an empty listing cache."""
    from azure.datalake.store import AzureDLFileSystem

    filesystem = AzureDLFileSystem(
        token=settings.TOKEN, store_name=settings.STORE_NAME
    )
    # Drop any cached listings so every request made during a test is
    # actually sent to the service and therefore captured.
    filesystem.invalidate_cache()
    yield filesystem
def __init__(self, tenant_id, client_id, client_secret, store_name,
             metadata=None):
    """Authenticate against Azure AD and open the Data Lake filesystem."""
    credentials = lib.auth(
        tenant_id=tenant_id,
        client_id=client_id,
        client_secret=client_secret,
    )
    self.adl = AzureDLFileSystem(token=credentials, store_name=store_name)
    super(WritableTextFilesADLSource, self).__init__(metadata=metadata)
def glob(self, path):
    """Expand a template *path* into the matching ``adl://`` URLs."""
    trimmed = self._trim_filename(path)
    prefix = 'adl://%s.azuredatalakestore.net/' % self.store_name
    return [prefix + match
            for match in AzureDLFileSystem.glob(self, trimmed)]
def __init__(self, tenant_id=None, client_id=None, client_secret=None,
             **kwargs):
    """Store credentials, authenticate, and build the wrapped filesystem."""
    self.tenant_id = tenant_id
    self.client_id = client_id
    self.client_secret = client_secret
    self.kwargs = kwargs
    # The auth token is stashed in kwargs so the backend receives it
    # alongside any caller-supplied connection options.
    self.kwargs["token"] = lib.auth(
        tenant_id=tenant_id,
        client_id=client_id,
        client_secret=client_secret,
    )
    self.fs = AzureDLFileSystem(**self.kwargs)
class AzureDatalakeFileSystem(AbstractFileSystem):
    """
    Access Azure Datalake Gen1 as if it were a file system.

    This exposes a filesystem-like API on top of Azure Datalake Storage

    Parameters
    ----------
    tenant_id: string
        Azure tenant, also known as the subscription id
    client_id: string
        The username or serivceprincipal id
    client_secret: string
        The access key
    store_name: string (optional)
        The name of the datalake account being accessed.  Should be inferred
        from the urlpath if using with Dask read_xxx and to_xxx methods.

    Examples
    --------
    >>> adl = AzureDatalakeFileSystem(tenant_id="xxxx", client_id="xxxx",
    ...                               client_secret="xxxx")
    >>> adl.ls('')

    Sharded Parquet & CSV files can be read as

    >>> storage_options = dict(tennant_id=TENNANT_ID, client_id=CLIENT_ID,
    ...                        client_secret=CLIENT_SECRET)  # doctest: +SKIP
    >>> ddf = dd.read_parquet('adl://store_name/folder/filename.parquet',
    ...                       storage_options=storage_options)  # doctest: +SKIP
    >>> ddf = dd.read_csv('adl://store_name/folder/*.csv',
    ...                   storage_options=storage_options)  # doctest: +SKIP

    Sharded Parquet and CSV files can be written as

    >>> ddf.to_parquet("adl://store_name/folder/filename.parquet",
    ...                storage_options=storage_options)  # doctest: +SKIP
    >>> ddf.to_csv('adl://store_name/folder/*.csv',
    ...            storage_options=storage_options)  # doctest: +SKIP
    """

    protocol = "adl"

    def __init__(self, tenant_id, client_id, client_secret, store_name):
        super().__init__()
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret
        self.store_name = store_name
        self.do_connect()

    @staticmethod
    def _get_kwargs_from_urls(paths):
        """Get the store_name from the urlpath and pass to storage_options."""
        ops = infer_storage_options(paths)
        out = {}
        if ops.get("host", None):
            out["store_name"] = ops["host"]
        return out

    @classmethod
    def _strip_protocol(cls, path):
        """Return the bare path component of an ``adl://`` url."""
        ops = infer_storage_options(path)
        return ops["path"]

    def do_connect(self):
        """Establish connection object."""
        token = lib.auth(
            tenant_id=self.tenant_id,
            client_id=self.client_id,
            client_secret=self.client_secret,
        )
        self.azure_fs = AzureDLFileSystem(token=token, store_name=self.store_name)

    def ls(self, path, detail=False, invalidate_cache=True, **kwargs):
        """List *path*, lower-casing the backend's ``"DIRECTORY"`` type to
        fsspec's ``"directory"``.

        Bug fix: with ``detail=False`` the backend returns plain path
        strings; the previous code applied dict operations to every entry,
        so ``"type" in file`` was a substring test and ``file["type"]``
        raised ``TypeError``.  Entries are now normalized only when they
        are dicts.
        """
        files = self.azure_fs.ls(
            path=path, detail=detail, invalidate_cache=invalidate_cache
        )
        for file in files:
            if isinstance(file, dict) and file.get("type") == "DIRECTORY":
                file["type"] = "directory"
        return files

    def info(self, path, invalidate_cache=True, expected_error_code=404, **kwargs):
        """Return metadata for *path*, adding fsspec's ``size`` key as an
        alias of the backend's ``length``."""
        info = self.azure_fs.info(
            path=path,
            invalidate_cache=invalidate_cache,
            expected_error_code=expected_error_code,
        )
        info["size"] = info["length"]
        return info

    def _trim_filename(self, fn, **kwargs):
        """Determine what kind of filestore this is and return the path."""
        so = infer_storage_options(fn)
        fileparts = so["path"]
        return fileparts

    def glob(self, path, details=False, invalidate_cache=True, **kwargs):
        """For a template path, return matching files."""
        adlpaths = self._trim_filename(path)
        filepaths = self.azure_fs.glob(
            adlpaths, details=details, invalidate_cache=invalidate_cache
        )
        return filepaths

    def isdir(self, path, **kwargs):
        """Is this entry directory-like?"""
        try:
            return self.info(path)["type"].lower() == "directory"
        except FileNotFoundError:
            return False

    def isfile(self, path, **kwargs):
        """Is this entry file-like?"""
        # NOTE: deliberately broad -- any backend failure is reported as
        # "not a file" rather than propagated.
        try:
            return self.azure_fs.info(path)["type"].lower() == "file"
        except Exception:
            return False

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Return a file-like object for *path* (extra args are ignored
        by the underlying ``AzureDatalakeFile``)."""
        return AzureDatalakeFile(self, path, mode=mode)

    def read_block(self, fn, offset, length, delimiter=None, **kwargs):
        """Read ``length`` bytes of ``fn`` starting at ``offset``,
        optionally aligned to ``delimiter`` by the backend."""
        return self.azure_fs.read_block(fn, offset, length, delimiter)

    def ukey(self, path):
        """Unique content key: a hash of the file's modification time."""
        return tokenize(self.info(path)["modificationTime"])

    def size(self, path):
        """Size of the file at *path* in bytes."""
        return self.info(path)["length"]

    def __getstate__(self):
        dic = self.__dict__.copy()
        logger.debug("Serialize with state: %s", dic)
        return dic

    def __setstate__(self, state):
        # The live connection object is not picklable; re-establish it
        # after restoring the credential attributes.
        logger.debug("De-serialize with state: %s", state)
        self.__dict__.update(state)
        self.do_connect()
def open(self, path, mode='rb'):
    """Strip the url parts from *path* and open it on the lake."""
    bare_path = self._trim_filename(path)
    return AzureDLFileSystem.open(self, bare_path, mode=mode)
def do_connect(self):
    """Refresh the auth token and (re-)initialize the wrapped filesystem."""
    self.kwargs['token'] = lib.auth(
        tenant_id=self.tenant_id,
        client_id=self.client_id,
        client_secret=self.client_secret,
    )
    AzureDLFileSystem.__init__(self, **self.kwargs)
class AzureDataLakeFileSystem(FileSystemBase):
    """drfs wrapper around ``AzureDLFileSystem``.

    Paths carry the store name as the URL host
    (``adl://STORE_NAME/folder/file``); every public operation first
    extracts the store name from the path, reconnects the wrapped
    filesystem to that store, then delegates to the base implementation
    with the bare path.
    """

    fs_cls = AzureDLFileSystem
    scheme = "adl"
    is_remote = True
    supports_scheme = False

    def __init__(self, tenant_id=None, client_id=None, client_secret=None, **kwargs):
        # Credentials are kept on the instance; remaining kwargs (plus the
        # freshly acquired token) are passed straight to the backend.
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret
        self.kwargs = kwargs
        # self.kwargs['store_name'] = kwargs['host']
        token = lib.auth(
            tenant_id=self.tenant_id,
            client_id=self.client_id,
            client_secret=self.client_secret,
        )
        self.kwargs["token"] = token
        self.fs = AzureDLFileSystem(**self.kwargs)

    def _parse_store_name(self, path):
        """Split an ``adl://STORE_NAME/...`` path into (store_name, bare path).

        Raises ``ValueError`` when the path has no host component.
        """
        from drfs.path import RemotePath

        if not isinstance(path, RemotePath):
            path = RemotePath(path)
        store_name, path = path.hostname, path.path
        if store_name == "":
            raise ValueError(
                "Can't connect without store name. Please provide the path in the "
                "following form: 'adl://STORE_NAME/folder/file.extension'!")
        return store_name, path

    def _connect(self, path):
        # Point the wrapped filesystem at the store named in *path* and
        # return the path stripped of scheme and host.
        self.fs.kwargs["store_name"], path = self._parse_store_name(path)
        self.fs.connect()
        return path

    def _add_store_name(self, p):
        """Re-insert the connected store name into a result path.

        NOTE(review): this replaces the third '/'-separated chunk of the
        drive part with the current store name while keeping the original
        chunk as the first path segment -- presumably matching
        ``drfs.path.RemotePath``'s drive layout; verify against that class.
        """
        from drfs.path import RemotePath

        parts = p.parts
        part0 = parts[0].split("/")[2]
        drv = parts[0].replace(part0, self.fs.kwargs["store_name"])
        return RemotePath(drv, part0, *parts[1:])

    def ls(self, path, *args, **kwargs):
        """List *path*, returning paths that include the store name."""
        path = self._connect(path)
        return [
            self._add_store_name(p)
            for p in super().ls(path, *args, **kwargs)
        ]

    def open(self, path, *args, **kwargs):
        """Open *path* on the store it names."""
        path = self._connect(path)
        return super().open(path, *args, **kwargs)

    def exists(self, path, *args, **kwargs):
        """Whether *path* exists on the store it names."""
        path = self._connect(path)
        return super().exists(path, *args, **kwargs)

    def remove(self, path, *args, **kwargs):
        """Delete *path* on the store it names."""
        path = self._connect(path)
        return super().remove(path, *args, **kwargs)

    def mv(self, path, *args, **kwargs):
        """Move/rename *path* on the store it names."""
        path = self._connect(path)
        return super().mv(path, *args, **kwargs)

    def makedirs(self, path, *args, **kwargs):
        """Create directory tree *path* on the store it names."""
        path = self._connect(path)
        return super().makedirs(path, *args, **kwargs)

    def rmdir(self, path, *args, **kwargs):
        """Remove directory *path* on the store it names."""
        path = self._connect(path)
        return super().rmdir(path, *args, **kwargs)

    def info(self, path, *args, **kwargs):
        """Metadata for *path* on the store it names."""
        path = self._connect(path)
        return super().info(path, *args, **kwargs)

    def walk(self, *args, **kwargs):
        """Recursively list files; first positional arg is the path."""
        arg0 = self._connect(args[0])
        return [
            self._add_store_name(p)
            for p in super().walk(arg0, *args[1:], **kwargs)
        ]

    def glob(self, *args, **kwargs):
        """Pattern-match files; first positional arg is the pattern path."""
        arg0 = self._connect(args[0])
        return [
            self._add_store_name(p)
            for p in super().glob(arg0, *args[1:], **kwargs)
        ]
def setup_env(request):
    """When recording, make sure the test working directory exists on the lake."""
    home = working_dir()
    fs = AzureDLFileSystem(
        store_name=settings.STORE_NAME, token=settings.TOKEN
    )
    if settings.RECORD_MODE != 'none' and not fs.exists(home):
        fs.mkdir(home)