Example #1
0
def open(
    bucket_id,
    key_id,
    mode,
    version_id=None,
    buffer_size=DEFAULT_BUFFER_SIZE,
    min_part_size=DEFAULT_MIN_PART_SIZE,
    multipart_upload=True,
    session=None,
    defer_seek=False,
    client=None,
    client_kwargs=None,
    writebuffer=None,
):
    """Open an S3 object through smart_open's s3 transport, pinned to the
    Wasabi endpoint configured in ``settings.WASABI_S3_ENDPOINT_URL``.

    NOTE(review): ``defer_seek``, ``client``, ``client_kwargs`` and
    ``writebuffer`` are accepted here but never forwarded to ``s3.open`` --
    confirm whether the underlying API version supports them before relying
    on these arguments.
    """
    endpoint_kwargs = {"endpoint_url": settings.WASABI_S3_ENDPOINT_URL}
    return s3.open(
        bucket_id,
        key_id,
        mode,
        version_id=version_id,
        buffer_size=buffer_size,
        min_part_size=min_part_size,
        multipart_upload_kwargs=None,
        multipart_upload=multipart_upload,
        session=session,
        resource_kwargs=endpoint_kwargs,
        singlepart_upload_kwargs=None,
        object_kwargs=None,
    )
Example #2
0
def s3_open_key(key, mode, **kwargs):
    """Open a boto S3 key for reading or writing via smart_open's S3 layer.

    :param key: A boto key object exposing ``name`` and ``bucket.name``.
    :param str mode: One of the smart_open_s3 READ/WRITE modes (text or binary).
    :param kwargs: Forwarded to ``smart_open_s3.open``; also recognizes
        ``host``, ``ignore_extension``, ``encoding`` and ``errors``.
    :returns: A file-like object, decompressed and decoded as appropriate.
    :raises NotImplementedError: If *mode* is not a supported S3 mode.
    """
    logger.debug('%r', locals())
    #
    # TODO: handle boto3 keys as well
    #
    host = kwargs.pop('host', None)
    if host is not None:
        kwargs['endpoint_url'] = 'http://' + host

    if kwargs.pop("ignore_extension", False):
        codec = None
    else:
        codec = _detect_codec(key.name)

    #
    # Codecs work on a byte-level, so the underlying S3 object should
    # always be reading bytes.
    #
    if mode in (smart_open_s3.READ, smart_open_s3.READ_BINARY):
        s3_mode = smart_open_s3.READ_BINARY
    elif mode in (smart_open_s3.WRITE, smart_open_s3.WRITE_BINARY):
        s3_mode = smart_open_s3.WRITE_BINARY
    else:
        raise NotImplementedError('mode %r not implemented for S3' % mode)

    # Fixed: use the module-level logger here (this previously called
    # logging.debug, bypassing this module's logger configuration).
    logger.debug('codec: %r mode: %r s3_mode: %r', codec, mode, s3_mode)
    # NOTE(review): encoding/errors are read with .get(), so they stay in
    # kwargs and are forwarded to smart_open_s3.open -- confirm it accepts
    # (or ignores) them.
    encoding = kwargs.get('encoding')
    errors = kwargs.get('errors', DEFAULT_ERRORS)
    fobj = smart_open_s3.open(key.bucket.name, key.name, s3_mode, **kwargs)
    decompressed_fobj = _CODECS[codec](fobj, mode)
    decoded_fobj = encoding_wrapper(decompressed_fobj,
                                    mode,
                                    encoding=encoding,
                                    errors=errors)
    return decoded_fobj
def _s3_open_uri(parsed_uri, mode, transport_params):
    """Open a parsed s3:// URI as a binary stream via smart_open_s3.

    :param parsed_uri: Parsed URI exposing ``bucket_id``/``key_id`` and
        optional ``access_id``/``access_secret`` credentials.
    :param str mode: Must be 'rb' or 'wb'.
    :param dict transport_params: Extra options; may carry a boto3 ``session``.
    :raises ValueError: If a text mode ('r'/'w') is requested.
    :raises NotImplementedError: For any other unsupported mode.
    """
    logger.debug('s3_open_uri: %r', locals())
    if mode in ('r', 'w'):
        raise ValueError('this function can only open binary streams. '
                         'Use smart_open.smart_open() to open text streams.')
    elif mode not in ('rb', 'wb'):
        # Fixed: the mode was previously passed as a second positional
        # argument to NotImplementedError instead of being %-formatted
        # into the message.
        raise NotImplementedError('unsupported mode: %r' % mode)

    #
    # There are two explicit ways we can receive session parameters from the user.
    #
    # 1. Via the session keyword argument (transport_params)
    # 2. Via the URI itself
    #
    # They are not mutually exclusive, but we have to pick one of the two.
    # Go with 1).
    #
    if transport_params.get('session') is not None and (
            parsed_uri.access_id or parsed_uri.access_secret):
        logger.warning(
            'ignoring credentials parsed from URL because they conflict with '
            'transport_params.session. Set transport_params.session to None '
            'to suppress this warning.')
    elif (parsed_uri.access_id and parsed_uri.access_secret):
        transport_params['session'] = boto3.Session(
            aws_access_key_id=parsed_uri.access_id,
            aws_secret_access_key=parsed_uri.access_secret,
        )

    kwargs = _check_kwargs(smart_open_s3.open, transport_params)
    return smart_open_s3.open(parsed_uri.bucket_id, parsed_uri.key_id, mode,
                              **kwargs)
Example #4
0
def s3_open_key(key, mode, **kwargs):
    """Open a boto S3 key via smart_open's S3 layer, with codec handling.

    :param key: A boto key object exposing ``name`` and ``bucket.name``.
    :param str mode: Read/write mode; coerced to binary when a codec applies.
    :param kwargs: Forwarded to ``smart_open_s3.open``; also recognizes
        ``host`` and ``ignore_extension``.
    :returns: The opened (and possibly codec-wrapped) file-like object.
    """
    logger.debug('%r', locals())
    #
    # TODO: handle boto3 keys as well
    #
    host = kwargs.pop('host', None)
    if host is not None:
        kwargs['endpoint_url'] = 'http://' + host

    if kwargs.pop("ignore_extension", False):
        codec = None
    else:
        codec = _detect_codec(key.name)

    #
    # Codecs work on a byte-level, so the underlying S3 object should
    # always be reading bytes.
    #
    if codec and mode in (smart_open_s3.READ, smart_open_s3.READ_BINARY):
        s3_mode = smart_open_s3.READ_BINARY
    elif codec and mode in (smart_open_s3.WRITE, smart_open_s3.WRITE_BINARY):
        s3_mode = smart_open_s3.WRITE_BINARY
    else:
        s3_mode = mode

    # Fixed: use the module-level logger here (this previously called
    # logging.debug, bypassing this module's logger configuration).
    logger.debug('codec: %r mode: %r s3_mode: %r', codec, mode, s3_mode)
    fobj = smart_open_s3.open(key.bucket.name, key.name, s3_mode, **kwargs)
    return _CODECS[codec](fobj, mode)
Example #5
0
def s3_open_uri(parsed_uri, mode, **kwargs):
    """Open an s3:// URI, layering decompression and text decoding on top.

    Credentials embedded in the URI are injected into *kwargs*, an optional
    ``host`` becomes an ``endpoint_url``, and the raw S3 stream is wrapped
    first by the detected codec and then by the encoding wrapper.

    :raises NotImplementedError: If *mode* is not a supported S3 mode.
    """
    logger.debug('%r', locals())
    if parsed_uri.access_id is not None:
        kwargs['aws_access_key_id'] = parsed_uri.access_id
    if parsed_uri.access_secret is not None:
        kwargs['aws_secret_access_key'] = parsed_uri.access_secret

    # An explicit S3 host is required for sigv4 operations.
    host = kwargs.pop('host', None)
    if host is not None:
        kwargs['endpoint_url'] = 'http://' + host

    #
    # TODO: this is the wrong place to handle ignore_extension.
    # It should happen at the highest level in the smart_open function, because
    # it influences other file systems as well, not just S3.
    #
    if kwargs.pop("ignore_extension", False):
        codec = None
    else:
        codec = _detect_codec(parsed_uri.key_id)

    #
    # Codecs operate on bytes, so the underlying S3 stream must be binary.
    #
    read_modes = (smart_open_s3.READ, smart_open_s3.READ_BINARY)
    write_modes = (smart_open_s3.WRITE, smart_open_s3.WRITE_BINARY)
    if mode in read_modes:
        binary_mode = smart_open_s3.READ_BINARY
    elif mode in write_modes:
        binary_mode = smart_open_s3.WRITE_BINARY
    else:
        raise NotImplementedError('mode %r not implemented for S3' % mode)

    #
    # TODO: I'm not sure how to handle this with boto3.  Any ideas?
    #
    # https://github.com/boto/boto3/issues/334
    #
    # _setup_unsecured_mode()

    raw = smart_open_s3.open(parsed_uri.bucket_id, parsed_uri.key_id,
                             binary_mode, **kwargs)
    decompressed = _CODECS[codec](raw, mode)
    return encoding_wrapper(decompressed,
                            mode,
                            encoding=kwargs.get('encoding'),
                            errors=kwargs.get('errors', DEFAULT_ERRORS))
Example #6
0
def _s3_open_uri(parsed_uri, mode, transport_params):
    """Open a parsed s3:// URI as a binary stream, honoring URL credentials
    and a non-default endpoint.

    :param parsed_uri: Parsed URI exposing ``bucket_id``/``key_id``, optional
        credentials, and ``host``/``port``.
    :param str mode: Must be 'rb' or 'wb'.
    :param dict transport_params: Extra options; may carry a boto3 ``session``.
    :raises ValueError: If a text mode ('r'/'w') is requested.
    :raises NotImplementedError: For any other unsupported mode.
    """
    logger.debug('s3_open_uri: %r', locals())
    if mode in ('r', 'w'):
        raise ValueError('this function can only open binary streams. '
                         'Use smart_open.smart_open() to open text streams.')
    elif mode not in ('rb', 'wb'):
        # Fixed: the mode was previously passed as a second positional
        # argument to NotImplementedError instead of being %-formatted
        # into the message.
        raise NotImplementedError('unsupported mode: %r' % mode)

    #
    # There are two explicit ways we can receive session parameters from the user.
    #
    # 1. Via the session keyword argument (transport_params)
    # 2. Via the URI itself
    #
    # They are not mutually exclusive, but we have to pick one of the two.
    # Go with 1).
    #
    if transport_params.get('session') is not None and (
            parsed_uri.access_id or parsed_uri.access_secret):
        logger.warning(
            'ignoring credentials parsed from URL because they conflict with '
            'transport_params.session. Set transport_params.session to None '
            'to suppress this warning.')
    elif (parsed_uri.access_id and parsed_uri.access_secret):
        transport_params['session'] = boto3.Session(
            aws_access_key_id=parsed_uri.access_id,
            aws_secret_access_key=parsed_uri.access_secret,
        )

    #
    # There are two explicit ways the user can provide the endpoint URI:
    #
    # 1. Via the URL.  The protocol is implicit, and we assume HTTPS in this case.
    # 2. Via the resource_kwargs and multipart_upload_kwargs endpoint_url parameter.
    #
    # Again, these are not mutually exclusive: the user can specify both.  We
    # have to pick one to proceed, however, and we go with 2.
    #
    if parsed_uri.host != _DEFAULT_S3_HOST:
        endpoint_url = 'https://%s:%d' % (parsed_uri.host, parsed_uri.port)
        _override_endpoint_url(transport_params, endpoint_url)

    kwargs = _check_kwargs(smart_open_s3.open, transport_params)
    return smart_open_s3.open(parsed_uri.bucket_id, parsed_uri.key_id, mode,
                              **kwargs)
def _s3_open_uri(parsed_uri, mode, **kwargs):
    """Open a parsed s3:// URI as a binary stream via smart_open_s3.

    :param parsed_uri: Parsed URI exposing ``bucket_id``/``key_id`` and
        optional ``access_id``/``access_secret`` credentials.
    :param str mode: Must be 'rb' or 'wb'.
    :param kwargs: Forwarded to ``smart_open_s3.open``; also recognizes ``host``.
    :raises ValueError: If a text mode ('r'/'w') is requested.
    :raises NotImplementedError: For any other unsupported mode.
    """
    logger.debug('s3_open_uri: %r', locals())
    if mode in ('r', 'w'):
        raise ValueError('this function can only open binary streams. '
                         'Use smart_open.smart_open() to open text streams.')
    elif mode not in ('rb', 'wb'):
        # Fixed: the mode was previously passed as a second positional
        # argument to NotImplementedError instead of being %-formatted
        # into the message.
        raise NotImplementedError('unsupported mode: %r' % mode)
    if parsed_uri.access_id is not None:
        kwargs['aws_access_key_id'] = parsed_uri.access_id
    if parsed_uri.access_secret is not None:
        kwargs['aws_secret_access_key'] = parsed_uri.access_secret

    # Get an S3 host. It is required for sigv4 operations.
    host = kwargs.pop('host', None)
    if host is not None:
        kwargs['endpoint_url'] = _add_scheme_to_host(host)

    return smart_open_s3.open(parsed_uri.bucket_id, parsed_uri.key_id, mode, **kwargs)
def _s3_open_uri(parsed_uri, mode, **kwargs):
    """Open a parsed s3:// URI as a binary stream via smart_open_s3.

    :param parsed_uri: Parsed URI exposing ``bucket_id``/``key_id`` and
        optional ``access_id``/``access_secret`` credentials.
    :param str mode: Must be 'rb' or 'wb'.
    :param kwargs: Forwarded to ``smart_open_s3.open``; also recognizes ``host``.
    :raises ValueError: If a text mode ('r'/'w') is requested.
    :raises NotImplementedError: For any other unsupported mode.
    """
    logger.debug('s3_open_uri: %r', locals())
    if mode in ('r', 'w'):
        raise ValueError('this function can only open binary streams. '
                         'Use smart_open.smart_open() to open text streams.')
    elif mode not in ('rb', 'wb'):
        # Fixed: the mode was previously passed as a second positional
        # argument to NotImplementedError instead of being %-formatted
        # into the message.
        raise NotImplementedError('unsupported mode: %r' % mode)
    if parsed_uri.access_id is not None:
        kwargs['aws_access_key_id'] = parsed_uri.access_id
    if parsed_uri.access_secret is not None:
        kwargs['aws_secret_access_key'] = parsed_uri.access_secret

    # Get an S3 host. It is required for sigv4 operations.
    host = kwargs.pop('host', None)
    if host is not None:
        kwargs['endpoint_url'] = 'http://' + host

    return smart_open_s3.open(parsed_uri.bucket_id, parsed_uri.key_id, mode, **kwargs)
def s3_open_uri(parsed_uri, mode, ignore_extension=False, **kwargs):
    """Open an s3:// URI, wrapping the raw stream for compression and encoding.

    Credentials embedded in the URI are injected into *kwargs*, an optional
    ``host`` becomes an ``endpoint_url``, and the raw S3 stream is passed
    through the compression wrapper and then the encoding wrapper.

    :raises NotImplementedError: If *mode* is not a supported S3 mode.
    """
    logger.debug('%r', locals())
    if parsed_uri.access_id is not None:
        kwargs['aws_access_key_id'] = parsed_uri.access_id
    if parsed_uri.access_secret is not None:
        kwargs['aws_secret_access_key'] = parsed_uri.access_secret

    # An explicit S3 host is required for sigv4 operations.
    host = kwargs.pop('host', None)
    if host is not None:
        kwargs['endpoint_url'] = 'http://' + host

    #
    # Codecs operate on bytes, so the underlying S3 stream must be binary.
    #
    read_modes = (smart_open_s3.READ, smart_open_s3.READ_BINARY)
    write_modes = (smart_open_s3.WRITE, smart_open_s3.WRITE_BINARY)
    if mode in read_modes:
        binary_mode = smart_open_s3.READ_BINARY
    elif mode in write_modes:
        binary_mode = smart_open_s3.WRITE_BINARY
    else:
        raise NotImplementedError('mode %r not implemented for S3' % mode)

    #
    # TODO: I'm not sure how to handle this with boto3.  Any ideas?
    #
    # https://github.com/boto/boto3/issues/334
    #
    # _setup_unsecured_mode()

    raw = smart_open_s3.open(parsed_uri.bucket_id, parsed_uri.key_id,
                             binary_mode, **kwargs)
    decompressed = compression_wrapper(raw, parsed_uri.key_id, mode,
                                       ignore_extension)
    return encoding_wrapper(decompressed,
                            mode,
                            encoding=kwargs.get('encoding'),
                            errors=kwargs.get('errors', DEFAULT_ERRORS))
Example #10
0
def _open_binary_stream(uri, mode, **kw):
    """Open an arbitrary URI in the specified binary mode.

    Not all modes are supported for all protocols.

    :arg uri: The URI to open.  May be a string, or something else.
    :arg str mode: The mode to open with.  Must be rb, wb or ab.
    :arg kw: TODO: document this.
    :returns: A file object and the filename
    :rtype: tuple
    """
    if mode not in ('rb', 'rb+', 'wb', 'wb+', 'ab', 'ab+'):
        #
        # This should really be a ValueError, but for the sake of compatibility
        # with older versions, which raise NotImplementedError, we do the same.
        #
        raise NotImplementedError('unsupported mode: %r' % mode)

    if isinstance(uri, six.string_types):
        # this method just routes the request to classes handling the specific storage
        # schemes, depending on the URI protocol in `uri`
        filename = uri.split('/')[-1]
        parsed_uri = _parse_uri(uri)
        unsupported = "%r mode not supported for %r scheme" % (mode, parsed_uri.scheme)

        if parsed_uri.scheme in ("file", ):
            # local files -- both read & write supported
            # compression, if any, is determined by the filename extension (.gz, .bz2, .xz)
            fobj = io.open(parsed_uri.uri_path, mode)
            return fobj, filename
        elif parsed_uri.scheme in smart_open_s3.SUPPORTED_SCHEMES:
            return _s3_open_uri(parsed_uri, mode, **kw), filename
        elif parsed_uri.scheme in ("hdfs", ):
            if mode == 'rb':
                return smart_open_hdfs.CliRawInputBase(parsed_uri.uri_path), filename
            elif mode == 'wb':
                return smart_open_hdfs.CliRawOutputBase(parsed_uri.uri_path), filename
            else:
                raise NotImplementedError(unsupported)
        elif parsed_uri.scheme in ("webhdfs", ):
            if mode == 'rb':
                fobj = smart_open_webhdfs.BufferedInputBase(parsed_uri.uri_path, **kw)
            elif mode == 'wb':
                fobj = smart_open_webhdfs.BufferedOutputBase(parsed_uri.uri_path, **kw)
            else:
                raise NotImplementedError(unsupported)
            return fobj, filename
        elif parsed_uri.scheme.startswith('http'):
            #
            # The URI may contain a query string and fragments, which interfere
            # with out compressed/uncompressed estimation.
            #
            filename = P.basename(urlparse.urlparse(uri).path)
            if mode == 'rb':
                return smart_open_http.BufferedInputBase(uri, **kw), filename
            else:
                raise NotImplementedError(unsupported)
        else:
            raise NotImplementedError("scheme %r is not supported", parsed_uri.scheme)
    elif isinstance(uri, boto.s3.key.Key):
        logger.debug('%r', locals())
        #
        # TODO: handle boto3 keys as well
        #
        host = kw.pop('host', None)
        if host is not None:
            kw['endpoint_url'] = _add_scheme_to_host(host)
        return smart_open_s3.open(uri.bucket.name, uri.name, mode, **kw), uri.name
    elif hasattr(uri, 'read'):
        # simply pass-through if already a file-like
        filename = '/tmp/unknown'
        return uri, filename
    else:
        raise TypeError('don\'t know how to handle uri %s' % repr(uri))
def _open_binary_stream(uri, mode, **kw):
    """Open an arbitrary URI in the specified binary mode.

    Not all modes are supported for all protocols.

    :arg uri: The URI to open.  May be a string, or something else.
    :arg str mode: The mode to open with.  Must be rb, wb or ab.
    :arg kw: TODO: document this.
    :returns: A file object and the filename
    :rtype: tuple
    """
    if mode not in ('rb', 'rb+', 'wb', 'wb+', 'ab', 'ab+'):
        #
        # This should really be a ValueError, but for the sake of compatibility
        # with older versions, which raise NotImplementedError, we do the same.
        #
        raise NotImplementedError('unsupported mode: %r' % mode)

    if isinstance(uri, six.string_types):
        # this method just routes the request to classes handling the specific storage
        # schemes, depending on the URI protocol in `uri`
        filename = uri.split('/')[-1]
        parsed_uri = _parse_uri(uri)
        unsupported = "%r mode not supported for %r scheme" % (mode, parsed_uri.scheme)

        if parsed_uri.scheme in ("file", ):
            # local files -- both read & write supported
            # compression, if any, is determined by the filename extension (.gz, .bz2)
            fobj = io.open(parsed_uri.uri_path, mode)
            return fobj, filename
        elif parsed_uri.scheme in ("s3", "s3n", 's3u'):
            return _s3_open_uri(parsed_uri, mode, **kw), filename
        elif parsed_uri.scheme in ("hdfs", ):
            if mode == 'rb':
                return smart_open_hdfs.CliRawInputBase(parsed_uri.uri_path), filename
            elif mode == 'wb':
                return smart_open_hdfs.CliRawOutputBase(parsed_uri.uri_path), filename
            else:
                raise NotImplementedError(unsupported)
        elif parsed_uri.scheme in ("webhdfs", ):
            if mode == 'rb':
                fobj = smart_open_webhdfs.BufferedInputBase(parsed_uri.uri_path, **kw)
            elif mode == 'wb':
                fobj = smart_open_webhdfs.BufferedOutputBase(parsed_uri.uri_path, **kw)
            else:
                raise NotImplementedError(unsupported)
            return fobj, filename
        elif parsed_uri.scheme.startswith('http'):
            #
            # The URI may contain a query string and fragments, which interfere
            # with out compressed/uncompressed estimation.
            #
            filename = P.basename(urlparse.urlparse(uri).path)
            if mode == 'rb':
                return smart_open_http.BufferedInputBase(uri, **kw), filename
            else:
                raise NotImplementedError(unsupported)
        else:
            raise NotImplementedError("scheme %r is not supported", parsed_uri.scheme)
    elif isinstance(uri, boto.s3.key.Key):
        logger.debug('%r', locals())
        #
        # TODO: handle boto3 keys as well
        #
        host = kw.pop('host', None)
        if host is not None:
            kw['endpoint_url'] = 'http://' + host
        return smart_open_s3.open(uri.bucket.name, uri.name, mode, **kw), uri.name
    elif hasattr(uri, 'read'):
        # simply pass-through if already a file-like
        filename = '/tmp/unknown'
        return uri, filename
    else:
        raise TypeError('don\'t know how to handle uri %s' % repr(uri))