def _parse_wiki_sql_dump(wiki_sql_dump_url, parse_fx, **kwargs):
    """Stream a wiki SQL dump and yield the items parsed from it.

    The dump is opened through ``_get_wiki_dump_obj``.  When ``dumps_path``
    is configured and no non-empty copy of the dump exists there yet, the
    compressed stream is first written to that directory and then reopened
    from the saved file.  The compressed stream is wrapped with
    ``compression_wrapper`` and iterated; the pieces are dispatched to a
    process pool running ``_parsing_task``.  Every worker result is
    zlib-decompressed, unpickled, and its items are yielded.

    Parameters
    ----------
    wiki_sql_dump_url:
        Location of the dump.  Must expose a ``name`` attribute, which is
        used as the on-disk file name and in progress messages.
    parse_fx:
        Callable forwarded to ``_parsing_task`` as ``parse_fx``.
    **kwargs:
        Overrides for the module-level ``config`` mapping.  The keys read
        here are ``dumps_path``, ``max_workers`` and ``verbose``.

    Yields
    ------
    The items of each unpickled worker result.  Results arrive in
    completion order (``imap_unordered``), not in dump order.
    """
    _kwargs = {**config, **kwargs}
    dumps_path = _kwargs["dumps_path"]
    max_workers = _kwargs["max_workers"]
    verbose = _kwargs["verbose"]
    dump_name = wiki_sql_dump_url.name
    msg.text(f"-> {dump_name}", show=verbose)
    tqdm_kwargs = {
        "unit": "B",
        "unit_scale": True,
        "unit_divisor": 1024,
        "disable": not verbose,
    }
    compress_obj, content_len = _get_wiki_dump_obj(wiki_sql_dump_url, verbose)
    if dumps_path is not None:
        # Create missing parents too, and tolerate a concurrent creation of
        # the directory between the check and the call.
        dumps_path.mkdir(parents=True, exist_ok=True)
        dump_filepath = dumps_path.joinpath(dump_name)
        # NOTE(review): when a non-empty copy already exists, parsing goes on
        # with the stream opened above rather than with the saved file --
        # confirm this is intended.
        if not dump_filepath.exists() or dump_filepath.stat().st_size == 0:
            try:
                with tqdm(
                    desc="download to disk",
                    total=content_len,
                    **tqdm_kwargs,
                ) as pbar, dump_filepath.open("wb") as fd:
                    bytes_read = 0
                    for chunk in compress_obj:
                        fd.write(chunk)
                        # Progress is measured in compressed bytes consumed
                        # from the source stream.
                        compress_bytes = compress_obj.tell()
                        pbar.update(compress_bytes - bytes_read)
                        bytes_read = compress_bytes
            finally:
                # Release the source stream even if the download fails.
                compress_obj.close()
            # From here on, read from the copy that was just written.
            wiki_sql_dump_url = dump_filepath
            # NOTE(review): ``verbose`` is not passed on this second call,
            # unlike the first one -- confirm this is intentional.
            compress_obj, content_len = _get_wiki_dump_obj(wiki_sql_dump_url)
    with tqdm(
        desc="parse",
        total=content_len,
        **tqdm_kwargs,
    ) as pbar, compression_wrapper(compress_obj, "rb") as decompress_obj:
        compress_bytes_read = 0
        with closing(Pool(max_workers)) as pool:
            task = partial(_parsing_task, parse_fx=parse_fx)
            for res in pool.imap_unordered(task, decompress_obj, chunksize=10):
                # The bar tracks the position in the *compressed* stream, so
                # it matches ``content_len``.
                compress_bytes = compress_obj.tell()
                pbar.update(compress_bytes - compress_bytes_read)
                compress_bytes_read = compress_bytes
                # Workers hand back zlib-compressed pickles of their results.
                yield from pickle_loads(zlib.decompress(res))
    msg.good(dump_name, show=verbose)
Beispiel #2
0
def open(
    uri,
    mode='r',
    buffering=-1,
    encoding=None,
    errors=None,
    newline=None,
    closefd=True,
    opener=None,
    ignore_ext=False,
    transport_params=None,
):
    r"""Open the URI object, returning a file-like object.

    The URI is usually a string in a variety of formats.
    For a full list of examples, see the :func:`parse_uri` function.

    The URI may also be one of:

    - an instance of the pathlib.Path class
    - a stream (anything that implements io.IOBase-like functionality)

    Parameters
    ----------
    uri: str or object
        The object to open.
    mode: str, optional
        Mimics built-in open parameter of the same name.
    buffering: int, optional
        Mimics built-in open parameter of the same name.
    encoding: str, optional
        Mimics built-in open parameter of the same name.
    errors: str, optional
        Mimics built-in open parameter of the same name.
    newline: str, optional
        Mimics built-in open parameter of the same name.
    closefd: boolean, optional
        Mimics built-in open parameter of the same name.  Ignored.
    opener: object, optional
        Mimics built-in open parameter of the same name.  Ignored.
    ignore_ext: boolean, optional
        Disable transparent compression/decompression based on the file extension.
    transport_params: dict, optional
        Additional parameters for the transport layer (see notes below).

    Returns
    -------
    A file-like object.

    Notes
    -----
    smart_open has several implementations for its transport layer (e.g. S3, HTTP).
    Each transport layer has a different set of keyword arguments for overriding
    default behavior.  If you specify a keyword argument that is *not* supported
    by the transport layer being used, smart_open will ignore that argument and
    log a warning message.

    smart_open/doctools.py magic goes here

    See Also
    --------
    - `Standard library reference <https://docs.python.org/3.7/library/functions.html#open>`__
    - `smart_open README.rst
      <https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst>`__

    """
    # Must stay the first statement: it logs exactly the call arguments.
    logger.debug('%r', locals())

    if not isinstance(mode, str):
        raise TypeError('mode should be a string')

    transport_params = {} if transport_params is None else transport_params

    # Give the shortcut path a chance first; it returns None when it does
    # not apply.
    shortcut = _shortcut_open(
        uri,
        mode,
        ignore_ext=ignore_ext,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
    )
    if shortcut is not None:
        return shortcut

    #
    # Work-around for Issue #144: an explicitly requested encoding is taken
    # to mean "text mode", even if the caller also asked for binary.  This
    # would become unnecessary if the default mode were text, as with the
    # built-in open.
    #
    requested_encoding = encoding
    if requested_encoding is not None and 'b' in mode:
        mode = mode.replace('b', '')

    if isinstance(uri, pathlib.Path):
        uri = str(uri)

    effective_encoding = requested_encoding or SYSTEM_ENCODING

    #
    # Pipeline from the filename to the returned object.  Both the
    # decompression and the decoding stages are optional; decompression maps
    # bytes to bytes, decoding maps bytes to text.  For writing the data
    # flows right to left through the same layers.
    #
    #           open as binary         decompress?          decode?
    # filename ---------------> bytes -------------> bytes ---------> text
    #                          binary             decompressed       decode
    #
    raw_stream = _open_binary_stream(
        uri,
        _TO_BINARY_LUT.get(mode, mode),
        transport_params,
    )
    byte_stream = (
        raw_stream
        if ignore_ext
        else compression.compression_wrapper(raw_stream, mode)
    )

    # Stay in bytes only when binary mode was requested and no encoding was
    # given explicitly.
    if 'b' in mode and requested_encoding is None:
        return byte_stream

    return _encoding_wrapper(
        byte_stream,
        mode,
        encoding=effective_encoding,
        errors=errors,
    )
Beispiel #3
0
def open(
    uri,
    mode='r',
    buffering=-1,
    encoding=None,
    errors=None,
    newline=None,
    closefd=True,
    opener=None,
    compression=so_compression.INFER_FROM_EXTENSION,
    transport_params=None,
):
    r"""Open the URI object, returning a file-like object.

    The URI is usually a string in a variety of formats.
    For a full list of examples, see the :func:`parse_uri` function.

    The URI may also be one of:

    - an instance of the pathlib.Path class
    - a stream (anything that implements io.IOBase-like functionality)

    Parameters
    ----------
    uri: str or object
        The object to open.
    mode: str, optional
        Mimics built-in open parameter of the same name.
    buffering: int, optional
        Mimics built-in open parameter of the same name.
    encoding: str, optional
        Mimics built-in open parameter of the same name.
    errors: str, optional
        Mimics built-in open parameter of the same name.
    newline: str, optional
        Mimics built-in open parameter of the same name.
    closefd: boolean, optional
        Mimics built-in open parameter of the same name.  Ignored.
    opener: object, optional
        Mimics built-in open parameter of the same name.  Ignored.
    compression: str, optional (see smart_open.compression.get_supported_compression_types)
        Explicitly specify the compression/decompression behavior.
    transport_params: dict, optional
        Additional parameters for the transport layer (see notes below).

    Returns
    -------
    A file-like object.

    Raises
    ------
    TypeError
        If ``mode`` is not a string.
    ValueError
        If ``compression`` is not one of the supported compression types.
    NotImplementedError
        If no binary mode can be derived from ``mode``.

    Notes
    -----
    smart_open has several implementations for its transport layer (e.g. S3, HTTP).
    Each transport layer has a different set of keyword arguments for overriding
    default behavior.  If you specify a keyword argument that is *not* supported
    by the transport layer being used, smart_open will ignore that argument and
    log a warning message.

    smart_open/doctools.py magic goes here

    See Also
    --------
    - `Standard library reference <https://docs.python.org/3.7/library/functions.html#open>`__
    - `smart_open README.rst
      <https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst>`__

    """
    # Must stay the first statement: it logs exactly the call arguments.
    logger.debug('%r', locals())

    if not isinstance(mode, str):
        raise TypeError('mode should be a string')

    if compression not in so_compression.get_supported_compression_types():
        raise ValueError(f'invalid compression type: {compression}')

    if transport_params is None:
        transport_params = {}

    # Give the shortcut path a chance first; it returns None when it does
    # not apply.
    fobj = _shortcut_open(
        uri,
        mode,
        compression=compression,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
        newline=newline,
    )
    if fobj is not None:
        return fobj

    #
    # This is a work-around for the problem described in Issue #144.
    # If the user has explicitly specified an encoding, then assume they want
    # us to open the destination in text mode, instead of the default binary.
    #
    # If we change the default mode to be text, and match the normal behavior
    # of Py2 and 3, then the above assumption will be unnecessary.
    #
    if encoding is not None and 'b' in mode:
        mode = mode.replace('b', '')

    if isinstance(uri, pathlib.Path):
        uri = str(uri)

    explicit_encoding = encoding
    encoding = explicit_encoding if explicit_encoding else DEFAULT_ENCODING

    #
    # This is how we get from the filename to the end result.  Decompression is
    # optional, but it always accepts bytes and returns bytes.
    #
    # Decoding is also optional, accepts bytes and returns text.  The diagram
    # below is for reading, for writing, the flow is from right to left, but
    # the code is identical.
    #
    #           open as binary         decompress?          decode?
    # filename ---------------> bytes -------------> bytes ---------> text
    #                          binary             decompressed       decode
    #

    try:
        binary_mode = _get_binary_mode(mode)
    except ValueError as ve:
        # Chain explicitly so the original error is reported as the direct
        # cause of the NotImplementedError.
        raise NotImplementedError(ve.args[0]) from ve

    binary = _open_binary_stream(uri, binary_mode, transport_params)
    decompressed = so_compression.compression_wrapper(binary, binary_mode,
                                                      compression)

    if 'b' not in mode or explicit_encoding is not None:
        decoded = _encoding_wrapper(
            decompressed,
            mode,
            encoding=encoding,
            errors=errors,
            newline=newline,
        )
    else:
        decoded = decompressed

    #
    # There are some useful methods in the binary readers, e.g. to_boto3, that get
    # hidden by the multiple layers of wrapping we just performed.  Promote
    # them so they are visible to the user.
    #
    # The question here is whether any wrapper was added, which is an
    # identity test; it must not depend on how the stream implements
    # equality.
    #
    if decoded is not binary:
        promoted_attrs = ['to_boto3']
        for attr in promoted_attrs:
            try:
                setattr(decoded, attr, getattr(binary, attr))
            except AttributeError:
                # Best effort: the binary stream does not provide the
                # attribute, or the wrapper does not accept it.
                pass

    return decoded