Example #1
    def __init__(
        self,
        simple_links=True,
        block_size=None,
        same_scheme=True,
        size_policy=None,
        cache_type="bytes",
        cache_options=None,
        asynchronous=False,
        loop=None,
        client_kwargs=None,
        **storage_options,
    ):
        """
        NB: if this is called async, you must await set_session

        Parameters
        ----------
        block_size: int
            Blocks to read bytes; if 0, will default to raw requests file-like
            objects instead of HTTPFile instances
        simple_links: bool
            If True, will consider both HTML <a> tags and anything that looks
            like a URL; if False, will consider only the former.
        same_scheme: bool
            When doing ls/glob, if this is True, only consider paths that have
            http/https matching the input URLs.
        size_policy: this argument is deprecated
        client_kwargs: dict
            Passed to aiohttp.ClientSession, see
            https://docs.aiohttp.org/en/stable/client_reference.html
            For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
        storage_options: key-value
            Any other parameters passed on to requests
        cache_type, cache_options: defaults used in open
        """
        super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
        self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
        self.simple_links = simple_links
        self.same_schema = same_scheme
        self.cache_type = cache_type
        self.cache_options = cache_options
        self.client_kwargs = client_kwargs or {}
        self.kwargs = storage_options
        self._session = None

        # Clean caching-related parameters from `storage_options`
        # before propagating them as `request_options` through `self.kwargs`.
        # TODO: Maybe rename `self.kwargs` to `self.request_options` to make
        #       it clearer.
        request_options = copy(storage_options)
        self.use_listings_cache = request_options.pop("use_listings_cache", False)
        request_options.pop("listings_expiry_time", None)
        request_options.pop("max_paths", None)
        request_options.pop("skip_instance_cache", None)
        self.kwargs = request_options

        if not asynchronous:
            sync(self.loop, self.set_session)
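Every snippet on this page funnels through fsspec.asyn.sync(loop, func, *args, **kwargs), which submits the coroutine produced by func to fsspec's dedicated background event loop and blocks the calling thread until it completes. A minimal, self-contained sketch of that bridge (fetch_json is a hypothetical coroutine; get_loop and sync are the real fsspec helpers):

import aiohttp
from fsspec.asyn import get_loop, sync

async def fetch_json(url):
    # runs on fsspec's background loop thread, not the calling thread
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return await resp.json()

data = sync(get_loop(), fetch_json, "https://httpbin.org/get")  # blocks until done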
Example #2
 def close_session(loop, session):
     if loop is not None and session is not None:
         if loop.is_running():
             try:
                 sync(loop, session.close, timeout=0.1)
             except fsspec.FSTimeoutError:
                 pass
Example #3
 def close_session(loop, session):
     if loop is not None and loop.is_running():
         try:
             sync(loop, session.close, timeout=0.1)
             return
         except (TimeoutError, FSTimeoutError):
             pass
     if session._connector is not None:
         # close after loop is dead
         session._connector._close()
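This variant falls back to aiohttp's private connector API when the loop is already dead, which is why it only runs as a last-resort finalizer. A rough sketch of how such a cleanup function is typically registered (DummyFS is hypothetical; Example #13 below shows an equivalent registration):

import weakref

class DummyFS:
    def __init__(self, loop, session):
        self.loop = loop
        self._session = session
        # close_session (defined above) fires when this instance is
        # garbage-collected, even if the event loop has stopped by then
        weakref.finalize(self, close_session, self.loop, self._session)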
Example #4
def test_write_small_secure(s3):
    # Unfortunately moto does not yet support enforcing SSE policies.  It also
    # does not return the correct objects that can be used to test the results
    # effectively.
    # This test is left as a placeholder in case moto eventually supports this.
    sse_params = SSEParams(server_side_encryption='aws:kms')
    with s3.open(secure_bucket_name + '/test', 'wb', writer_kwargs=sse_params) as f:
        f.write(b'hello')
    assert s3.cat(secure_bucket_name + '/test') == b'hello'
    sync(s3.loop, s3.s3.head_object, Bucket=secure_bucket_name, Key='test')
Example #5
 def __init__(
     self,
     fs,
     url,
     mode="rb",
     asynchronous=False,
     session=None,
     loop=None,
     **kwargs
 ):
     path = fs._strip_protocol(url)
     url = URL(fs.webdav_url) / path
     self.url = url.as_uri()
     self.details = {"name": self.url, "size": None}
     self.asynchronous = asynchronous
     self.session = session
     self.loop = loop
     super(HTTPStreamFile, self).__init__(
         fs=fs,
         path=path,
         mode=mode,
         block_size=0,
         cache_type="none",
         cache_options={},
         **kwargs)
     if self.mode == "rb":
         self.r = sync(self.loop, self.session.get, self.url, **self.kwargs)
     elif self.mode == "wb":
         pass
     else:
         raise ValueError
Example #6
 def _simple_upload(self):
     """One-shot upload, less than 5MB"""
     self.buffer.seek(0)
     data = self.buffer.read()
     sync(
         self.gcsfs.loop,
         simple_upload,
         self.gcsfs,
         self.bucket,
         self.key,
         data,
         self.metadata,
         self.consistency,
         self.content_type,
         timeout=self.timeout,
     )
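Note that timeout= on this call is consumed by sync() itself rather than forwarded to simple_upload: sync wraps the coroutine in asyncio.wait_for and raises fsspec's FSTimeoutError when the deadline passes. A small sketch using only public fsspec helpers:

import asyncio
import fsspec
from fsspec.asyn import get_loop, sync

async def slow():
    await asyncio.sleep(10)

try:
    sync(get_loop(), slow, timeout=0.1)  # timeout= belongs to sync, not slow()
except fsspec.FSTimeoutError:
    print("gave up after 0.1s instead of blocking for 10s")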
Example #7
    def __init__(
        self,
        api_url=None,
        webdav_url=None,
        username=None,
        password=None,
        token=None,
        block_size=None,
        asynchronous=False,
        loop=None,
        client_kwargs=None,
        **storage_options
    ):
        """
        NB: if this is called async, you must await set_client

        Parameters
        ----------
        block_size: int
            Blocks to read bytes; if 0, will default to raw requests file-like
            objects instead of HTTPFile instances
        client_kwargs: dict
            Passed to aiohttp.ClientSession, see
            https://docs.aiohttp.org/en/stable/client_reference.html
            For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
        storage_options: key-value
            Any other parameters passed on to requests
        """
        super().__init__(
            self,
            asynchronous=asynchronous,
            loop=loop,
            **storage_options
        )
        self.api_url = api_url
        self.webdav_url = webdav_url
        self.client_kwargs = client_kwargs or {}
        if (username is None) ^ (password is None):
            raise ValueError('Username and password must be given together')
        if (username is not None) and (password is not None):
            self.client_kwargs.update(
                auth=aiohttp.BasicAuth(username, password)
            )
        if token is not None:
            if password is not None:
                raise ValueError('Provide either token or username/password')
            headers = self.client_kwargs.get('headers', {})
            headers.update(Authorization=f'Bearer {token}')
            self.client_kwargs.update(headers=headers)
        block_size = DEFAULT_BLOCK_SIZE if block_size is None else block_size
        self.block_size = block_size
        self.kwargs = storage_options
        if not asynchronous:
            self._session = sync(self.loop, get_client, **self.client_kwargs)
            weakref.finalize(self, sync, self.loop, self.session.close)
        else:
            self._session = None
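A hedged usage sketch of the credential handling above; WebDAVFileSystem is a stand-in name (the snippet does not show the class), but the keyword arguments mirror this __init__:

# WebDAVFileSystem is the hypothetical name of the class defined above.
# Pass either username/password (becomes aiohttp.BasicAuth) or token
# (becomes an Authorization: Bearer header), never both.
fs = WebDAVFileSystem(
    webdav_url="https://dav.example.com",
    username="alice",
    password="s3cret",
)
# fs = WebDAVFileSystem(webdav_url="https://dav.example.com", token="...")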
Example #8
 def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs):
     self.asynchronous = kwargs.pop("asynchronous", False)
     self.url = url
     self.loop = loop
     self.session = session
     if mode != "rb":
         raise ValueError
     self.details = {"name": url, "size": None}
     super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs)
     self.r = sync(self.loop, get, self.session, url, **kwargs)
Example #9
 def write(self, data):
     if self.mode != "wb":
         raise ValueError("File not in write mode")
     self.r = sync(
         self.loop,
         self.session.put,
         self.url,
         data=data,
         **self.kwargs
     )
     self.r.raise_for_status()
Example #10
 def _initiate_upload(self):
     """ Create multi-upload """
     self.location = sync(
         self.gcsfs.loop,
         initiate_upload,
         self.gcsfs,
         self.bucket,
         self.key,
         self.content_type,
         self.metadata,
         timeout=self.timeout,
     )
Example #11
    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=None,  # XXX: This differs from the base class.
        cache_type=None,
        cache_options=None,
        size=None,
        **kwargs,
    ):
        """Make a file-like object

        Parameters
        ----------
        path: str
            Full URL with protocol
        mode: string
            must be "rb"
        block_size: int or None
            Bytes to download in one request; use instance value if None. If
            zero, will return a streaming Requests file-like instance.
        kwargs: key-value
            Any other parameters, passed to requests calls
        """
        if mode != "rb":
            raise NotImplementedError
        block_size = block_size if block_size is not None else self.block_size
        kw = self.kwargs.copy()
        kw["asynchronous"] = self.asynchronous
        kw.update(kwargs)
        size = size or self.size(path)
        session = sync(self.loop, self.set_session)
        if block_size and size:
            return HTTPFile(
                self,
                path,
                session=session,
                block_size=block_size,
                mode=mode,
                size=size,
                cache_type=cache_type or self.cache_type,
                cache_options=cache_options or self.cache_options,
                loop=self.loop,
                **kw,
            )
        else:
            return HTTPStreamFile(self,
                                  path,
                                  mode=mode,
                                  loop=self.loop,
                                  session=session,
                                  **kw)
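The block_size switch documented in this method is reachable from ordinary user code; a brief sketch via the public entry point (the URL is a placeholder):

import fsspec

fs = fsspec.filesystem("http")
# block_size=0 -> streaming HTTPStreamFile; a positive block_size with a
# known size -> random-access HTTPFile with read-ahead caching
with fs.open("https://example.com/data.bin", mode="rb", block_size=0) as f:
    head = f.read(1024)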
Example #12
    def __init__(
        self,
        simple_links=True,
        block_size=None,
        same_scheme=True,
        size_policy=None,
        cache_type="bytes",
        cache_options=None,
        asynchronous=False,
        loop=None,
        client_kwargs=None,
        **storage_options,
    ):
        """
        NB: if this is called async, you must await set_client

        Parameters
        ----------
        block_size: int
            Blocks to read bytes; if 0, will default to raw requests file-like
            objects instead of HTTPFile instances
        simple_links: bool
            If True, will consider both HTML <a> tags and anything that looks
            like a URL; if False, will consider only the former.
        same_scheme: bool
            When doing ls/glob, if this is True, only consider paths that have
            http/https matching the input URLs.
        size_policy: this argument is deprecated
        client_kwargs: dict
            Passed to aiohttp.ClientSession, see
            https://docs.aiohttp.org/en/stable/client_reference.html
            For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
        storage_options: key-value
            Any other parameters passed on to requests
        cache_type, cache_options: defaults used in open
        """
        super().__init__(self,
                         asynchronous=asynchronous,
                         loop=loop,
                         **storage_options)
        self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
        self.simple_links = simple_links
        self.same_schema = same_scheme
        self.cache_type = cache_type
        self.cache_options = cache_options
        self.client_kwargs = client_kwargs or {}
        self.kwargs = storage_options
        if not asynchronous:
            self._session = sync(self.loop, get_client, **self.client_kwargs)
            weakref.finalize(self, sync, self.loop, self.session.close)
        else:
            self._session = None
Example #13
    def __init__(
        self,
        project=DEFAULT_PROJECT,
        access="full_control",
        token=None,
        block_size=None,
        consistency="none",
        cache_timeout=None,
        secure_serialize=True,
        check_connection=False,
        requests_timeout=None,
        requester_pays=False,
        asynchronous=False,
        session_kwargs=None,
        loop=None,
        timeout=None,
        **kwargs,
    ):
        super().__init__(
            self,
            listings_expiry_time=cache_timeout,
            asynchronous=asynchronous,
            loop=loop,
            **kwargs,
        )
        if access not in self.scopes:
            raise ValueError("access must be one of {}", self.scopes)
        if project is None:
            warnings.warn(
                "GCS project not set - cannot list or create buckets")
        if block_size is not None:
            self.default_block_size = block_size
        self.requester_pays = requester_pays
        self.consistency = consistency
        self.cache_timeout = cache_timeout or kwargs.pop(
            "listings_expiry_time", None)
        self.requests_timeout = requests_timeout
        self.timeout = timeout
        self._session = None
        self.session_kwargs = session_kwargs or {}

        self.credentials = GoogleCredentials(project, access, token,
                                             check_connection)

        if not self.asynchronous:
            self._session = sync(self.loop,
                                 get_client,
                                 timeout=self.timeout,
                                 **self.session_kwargs)
            weakref.finalize(self, self.close_session, self.loop,
                             self._session)
Example #14
def test_xattr(s3):
    bucket, key = (test_bucket_name, 'tmp/test/xattr')
    filename = bucket + '/' + key
    body = b'aaaa'
    public_read_acl = {'Permission': 'READ', 'Grantee': {
        'URI': 'http://acs.amazonaws.com/groups/global/AllUsers', 'Type': 'Group'}}

    sync(s3.loop, s3.s3.put_object, Bucket=bucket, Key=key,
         ACL='public-read',
         Metadata=test_xattr_sample_metadata,
         Body=body)

    # save etag for later
    etag = s3.info(filename)['ETag']
    assert public_read_acl in sync(
        s3.loop, s3.s3.get_object_acl,
        Bucket=bucket, Key=key
    )['Grants']

    assert s3.getxattr(
        filename, 'test_xattr') == test_xattr_sample_metadata['test_xattr']
    assert s3.metadata(filename) == {'test-xattr': '1'}  # note _ became -

    s3file = s3.open(filename)
    assert s3file.getxattr(
        'test_xattr') == test_xattr_sample_metadata['test_xattr']
    assert s3file.metadata() == {'test-xattr': '1'}  # note _ became -

    s3file.setxattr(test_xattr='2')
    assert s3file.getxattr('test_xattr') == '2'
    s3file.setxattr(**{'test_xattr': None})
    assert s3file.metadata() == {}
    assert s3.cat(filename) == body

    # check that ACL and ETag are preserved after updating metadata
    assert public_read_acl in sync(s3.loop, s3.s3.get_object_acl,
        Bucket=bucket, Key=key)['Grants']
    assert s3.info(filename)['ETag'] == etag
Example #15
    def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs):
        self.asynchronous = kwargs.pop("asynchronous", False)
        self.url = url
        self.loop = loop
        self.session = session
        if mode != "rb":
            raise ValueError
        self.details = {"name": url, "size": None}
        super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs)

        async def cor():
            r = await self.session.get(url, **kwargs).__aenter__()
            self.fs._raise_not_found_for_status(r, url)
            return r

        self.r = sync(self.loop, cor)
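The inner cor() closure highlights a useful property of sync(): it accepts any coroutine function, so several awaits can be bundled into one blocking call. The same shape in generic form (blocking_get is hypothetical; the session is assumed to be an aiohttp.ClientSession living on fsspec's loop):

from fsspec.asyn import get_loop, sync

def blocking_get(session, url, **kwargs):
    async def cor():
        # bundle request + status check into a single round trip to the loop
        r = await session.get(url, **kwargs)
        r.raise_for_status()
        return r

    return sync(get_loop(), cor)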
Example #16
def test_copy_managed(s3):
    data = b'abc' * 12*2**20
    fn = test_bucket_name + '/test/biggerfile'
    with s3.open(fn, 'wb') as f:
        f.write(data)
    sync(s3.loop, s3._copy_managed, fn, fn + '2', size=len(data), block=5 * 2 ** 20)
    assert s3.cat(fn) == s3.cat(fn + '2')
    with pytest.raises(ValueError):
        sync(s3.loop, s3._copy_managed, fn, fn + '3', size=len(data), block=4 * 2 ** 20)
    with pytest.raises(ValueError):
        sync(s3.loop, s3._copy_managed, fn, fn + '3', size=len(data), block=6 * 2 ** 30)
Example #17
    def __init__(self,
                 simple_links=True,
                 block_size=None,
                 same_scheme=True,
                 size_policy=None,
                 cache_type="bytes",
                 cache_options=None,
                 asynchronous=False,
                 loop=None,
                 **storage_options):
        """
        NB: if this is called async, you must await set_client

        Parameters
        ----------
        block_size: int
            Blocks to read bytes; if 0, will default to raw requests file-like
            objects instead of HTTPFile instances
        simple_links: bool
            If True, will consider both HTML <a> tags and anything that looks
            like a URL; if False, will consider only the former.
        same_scheme: bool
            When doing ls/glob, if this is True, only consider paths that have
            http/https matching the input URLs.
        size_policy: this argument is deprecated
        storage_options: key-value
            May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
            other parameters passed on to requests
        cache_type, cache_options: defaults used in open
        """
        super().__init__(self,
                         asynchronous=asynchronous,
                         loop=loop,
                         **storage_options)
        self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
        self.simple_links = simple_links
        self.same_schema = same_scheme
        self.cache_type = cache_type
        self.cache_options = cache_options
        self.kwargs = storage_options
        if not asynchronous:
            self._session = sync(self.loop, get_client)
            weakref.finalize(self, sync, self.loop, self.session.close)
        else:
            self._session = None
Example #18
    def _directory_model_from_path(self, path, content=False):
        def s3_detail_to_model(s3_detail):
            model_path = s3_detail["Key"]
            model = base_model(self.fs.unprefix(model_path))
            if s3_detail["StorageClass"] == 'DIRECTORY':
                model["created"] = model["last_modified"] = DUMMY_CREATED_DATE
                model["type"] = "directory"
                lstat = self.fs.lstat(model_path)
                if "ST_MTIME" in lstat and lstat["ST_MTIME"]:
                    model["last_modified"] = model["created"] = lstat[
                        "ST_MTIME"]
            else:
                model["last_modified"] = s3_detail.get("LastModified").replace(
                    microsecond=0, tzinfo=tzutc())
                model["created"] = model["last_modified"]
                # model["size"] = s3_detail.get("Size")
                model["type"] = "notebook" if model_path.endswith(
                    ".ipynb") else "file"
            return model

        self.log.debug(
            "S3contents.GenericManager._directory_model_from_path: path('%s') type(%s)",
            path,
            content,
        )
        model = base_directory_model(path)
        if self.fs.isdir(path):
            lstat = self.fs.lstat(path)
            if "ST_MTIME" in lstat and lstat["ST_MTIME"]:
                model["last_modified"] = model["created"] = lstat["ST_MTIME"]
        if content:
            if not self.dir_exists(path):
                self.no_such_entity(path)
            model["format"] = "json"
            prefixed_path = self.fs.path(path)
            files_s3_detail = sync(self.fs.fs.loop, self.fs.fs._lsdir,
                                   prefixed_path)
            filtered_files_s3_detail = [
                detail for detail in files_s3_detail
                if os.path.basename(detail['Key']) != self.fs.dir_keep_file
            ]
            model["content"] = list(
                map(s3_detail_to_model, filtered_files_s3_detail))
        return model
Example #19
 def _close_session(looplocal):
     loop = getattr(looplocal, "loop", None)
     session = getattr(looplocal, "_session", None)
     if loop is not None and session is not None:
         sync(loop, session.close)
Example #20
 def buckets(self):
     """Return list of available project buckets."""
     return [
         b["name"]
         for b in sync(self.loop, self._list_buckets, timeout=self.timeout)
     ]
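The same one-liner generalizes to any async method that should look synchronous to callers; a hedged sketch with a stand-in _list_buckets:

from fsspec.asyn import AsyncFileSystem, sync

class MyFS(AsyncFileSystem):
    async def _list_buckets(self):
        return [{"name": "bucket-a"}, {"name": "bucket-b"}]  # stand-in data

    @property
    def buckets(self):
        # blocks the caller until the coroutine finishes on self.loop
        return [b["name"] for b in sync(self.loop, self._list_buckets)]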
Example #21
    def __init__(
        self,
        simple_links=True,
        block_size=None,
        same_scheme=True,
        size_policy=None,
        cache_type="bytes",
        cache_options=None,
        asynchronous=False,
        loop=None,
        client_kwargs=None,
        get_client=get_client,
        **storage_options,
    ):
        """
        NB: if this is called async, you must await set_session

        Parameters
        ----------
        block_size: int
            Blocks to read bytes; if 0, will default to raw requests file-like
            objects instead of HTTPFile instances
        simple_links: bool
            If True, will consider both HTML <a> tags and anything that looks
            like a URL; if False, will consider only the former.
        same_scheme: bool
            When doing ls/glob, if this is True, only consider paths that have
            http/https matching the input URLs.
        size_policy: this argument is deprecated
        client_kwargs: dict
            Passed to aiohttp.ClientSession, see
            https://docs.aiohttp.org/en/stable/client_reference.html
            For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
        get_client: Callable[..., aiohttp.ClientSession]
            A callable which takes keyword arguments and constructs
            an aiohttp.ClientSession. Its state will be managed by
            the HTTPFileSystem class.
        storage_options: key-value
            Any other parameters passed on to requests
        cache_type, cache_options: defaults used in open
        """
        super().__init__(
            simple_links=simple_links,
            block_size=block_size,
            same_scheme=same_scheme,
            size_policy=size_policy,
            cache_type=cache_type,
            cache_options=cache_options,
            asynchronous=asynchronous,
            loop=loop,
            client_kwargs=client_kwargs,
            get_client=get_client,
            **storage_options,
        )
        request_options = copy(storage_options)
        self.use_listings_cache = request_options.pop("use_listings_cache", False)
        request_options.pop("listings_expiry_time", None)
        request_options.pop("max_paths", None)
        request_options.pop("skip_instance_cache", None)
        listings_cache_type = request_options.pop("listings_cache_type", None)
        listings_cache_location = request_options.pop("listings_cache_location", None)

        if self.use_listings_cache:
            if listings_cache_type == "filedircache":
                logger.info(f"Dircache located at {listings_cache_location}")

        self.kwargs = request_options

        if not asynchronous:
            sync(self.loop, self.set_session)
Example #22
 def close_session(loop, session):
     if loop is not None and loop.is_running():
         sync(loop, session.close)
     elif session._connector is not None:
         # close after loop is dead
         session._connector._close()
Example #23
def test_checksum(s3):
    bucket = test_bucket_name
    d = "checksum"
    prefix = d+"/e"
    o1 = prefix + "1"
    o2 = prefix + "2"
    path1 = bucket + "/" + o1
    path2 = bucket + "/" + o2

    client = s3.s3

    # init client and files
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="")
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o2, Body="")

    # change one file, using cache
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="foo")
    checksum = s3.checksum(path1)
    s3.ls(path1) # force caching
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="bar")
    # refresh == False => checksum doesn't change
    assert checksum == s3.checksum(path1)

    # change one file, without cache
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="foo")
    checksum = s3.checksum(path1, refresh=True)
    s3.ls(path1) # force caching
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="bar")
    # refresh == True => checksum changes
    assert checksum != s3.checksum(path1, refresh=True)


    # Test for nonexistent file
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="bar")
    s3.ls(path1) # force caching
    sync(s3.loop, client.delete_object, Bucket=bucket, Key=o1)
    with pytest.raises(FileNotFoundError):
        s3.checksum(o1, refresh=True)
    
    # Test multipart upload
    upload_id = sync(s3.loop, client.create_multipart_upload,
        Bucket=bucket,
        Key=o1,
    )["UploadId"]
    etag1 = sync(s3.loop, client.upload_part,
        Bucket=bucket,
        Key=o1,
        UploadId=upload_id,
        PartNumber=1,
        Body="0" * (5 * 1024 * 1024),
    )['ETag']
    etag2 = sync(s3.loop, client.upload_part,
        Bucket=bucket,
        Key=o1,
        UploadId=upload_id,
        PartNumber=2,
        Body="0",
    )['ETag']
    sync(s3.loop, client.complete_multipart_upload,
        Bucket=bucket,
        Key=o1,
        UploadId=upload_id,
        MultipartUpload={'Parts': [
            {'PartNumber': 1, 'ETag': etag1},
            {'PartNumber': 2, 'ETag': etag2},
        ]},
    )
    s3.checksum(path1, refresh=True)