Example #1
    def _generate_examples(self, files):
        """Yield examples as (key, example) tuples."""
        key = 0
        if isinstance(files, list):
            import zstandard as zstd

            for path in files:
                with zstd.open(open(path, "rb"), "rt", encoding="utf-8") as f:
                    for row in f:
                        data = json.loads(row)
                        yield key, data
                        key += 1
        else:
            for subset in files:
                if subset == "free_law":
                    import zstandard as zstd

                    with zstd.open(open(files[subset], "rb"),
                                   "rt",
                                   encoding="utf-8") as f:
                        for row in f:
                            data = json.loads(row)
                            yield key, data
                            key += 1
                elif subset == "pubmed_central":
                    for path, file in files[subset]:
                        id_ = path.split("/")[-1].split(".")[0]
                        text = file.read().decode("utf-8")
                        yield key, {
                            "id": id_,
                            "text": text,
                        }
                        key += 1
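For reference (not from the original loader): zstandard.open also accepts a plain path, so the open(path, "rb") wrapper above is only needed when reusing an already-open binary file object. A minimal sketch with a hypothetical file name:

import json
import zstandard as zstd

# "corpus.jsonl.zst" is an illustrative path; iterate over decompressed JSON lines.
with zstd.open("corpus.jsonl.zst", "rt", encoding="utf-8") as f:
    for row in f:
        record = json.loads(row)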
Example #2
def extract_word_frequency(json_file: Path) -> None:
    title_counter: Counter[str] = Counter()
    body_counter: Counter[str] = Counter()
    with zst.open(json_file, mode="r", newline="\n") as f:
        for i, line in enumerate(f):
            if i % 100000 == 0:
                print(f"> {i} {json_file} {len(title_counter)} {len(body_counter)}")
            title, score, body = json.loads(line)
            if title is None or body is None:
                continue
            title_counter.update(strip_word(w) for w in title.split())
            body_counter.update(strip_word(w) for w in strip_body(body).split())
    print(f"> done {json_file} {len(title_counter)} {len(body_counter)}")
Example #3
def _repack(
    fin: IO[bytes],
    fout: IO[bytes],
    index_fout: IO[bytes],
    chunk_size: int,
    extractor: Callable,
    output_compression: str = Compression.GZIP.value,
) -> None:
    start_offset, end_offset = 0, 0
    compressed_chunk = None
    compression = _determine_compression_from_header(fin)
    assert compression in SUPPORTED_COMPRESSIONS
    assert output_compression in SUPPORTED_COMPRESSIONS

    with _open_compressed_file(fin, compression, mode='rb') as fin:
        for batch in _batch_iterator(fin, decode_lines=False, batch_size=chunk_size):
            keys = []
            line_indexes = []
            chunk = io.BytesIO()
            compressed_chunk = _open_compressed_file(chunk, output_compression, mode='wb')

            line_start, line_end = 0, 0
            for line in batch:
                key = extractor(line)
                keys.append(key)
                line_start = line_end
                line_end = line_start + len(line)
                line_indexes.append('|%s|%s' % (line_start, line_end - line_start))
                compressed_chunk.write(line)

            compressed_chunk.close()
            fout.write(chunk.getvalue())
            fout.flush()

            start_offset = end_offset
            end_offset = start_offset + chunk.getbuffer().nbytes
            for i, key in enumerate(keys):
                index = '%s|%s|%s' % (key, start_offset, end_offset - start_offset)
                index += line_indexes[i]
                index_fout.write(index.encode(_TEXT_ENCODING) + _LINE_TERMINATOR)
            index_fout.flush()

    if compressed_chunk is None:
        #
        # The input file contained no data.  Write an empty compressed chunk
        # in the requested output compression so the output file is still
        # readable by the decompressor.
        #
        empty_chunk = io.BytesIO()
        with _open_compressed_file(empty_chunk, output_compression, mode='wb') as writer:
            writer.write(b'')
        fout.write(empty_chunk.getvalue())
        fout.flush()
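Each index line written above packs five '|'-separated fields: key, chunk offset, chunk length, line offset inside the decompressed chunk, and line length. A minimal parsing sketch, assuming _TEXT_ENCODING is UTF-8 and _LINE_TERMINATOR is a newline (both assumptions):

def parse_index_line(raw: bytes) -> dict:
    # Split one index record back into its five fields (the names are
    # descriptive choices for this sketch, not identifiers from the original module).
    key, chunk_off, chunk_len, line_off, line_len = raw.decode("utf-8").rstrip("\n").split("|")
    return {
        "key": key,
        "chunk_offset": int(chunk_off),
        "chunk_length": int(chunk_len),
        "line_offset": int(line_off),
        "line_length": int(line_len),
    }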
Example #4
def cached(func: Callable[[],Any], filename: pathlib.Path, rerun: Optional[bool] = False):
    global RERUN_ALL_CACHED
    ext = '.pkl.zstd'
    filename = filename.with_suffix(ext)
    if not filename.exists() or rerun or RERUN_ALL_CACHED:
        result = func()
        if result is None:
            raise ValueError('The function you passed did not return anything')
        with zstandard.open(filename, 'wb') as outfile:
            pickle.dump(result, outfile)
    else:
        with zstandard.open(filename, 'rb') as infile:
            result = pickle.load(infile)
    return result
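A hypothetical call to cached() above; build_report is an illustrative callable, and RERUN_ALL_CACHED is the module-level flag the helper consults:

import pathlib

def build_report():
    return {"rows": 1000, "status": "ok"}

# The first call computes and pickles the result to report.pkl.zstd;
# later calls reload it from that file instead of recomputing.
report = cached(build_report, pathlib.Path("report"))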
Example #5
    def test_write_text_filename(self):
        with tempfile.TemporaryDirectory() as td:
            p = os.path.join(td, "testfile")

            fh = zstd.open(p, "w")
            self.assertIsInstance(fh, io.TextIOWrapper)

            fh.write("foo\n")
            fh.write("bar\n")
            fh.close()
            self.assertTrue(fh.closed)

            with zstd.open(p, "r") as fh:
                self.assertEqual(fh.read(), "foo\nbar\n")
Example #6
def _open_compressed_file(
    path: Union[str, IO],
    compression: Optional[str],
    mode: str = 'r'
) -> IO:
    """Open the specified file for reading/writing.

    Transparently opens compressed (``.gz``, ``.zst``) files.
    """
    if 'b' not in mode and 't' not in mode:
        mode += 't'
    encoding = None if 'b' in mode else _TEXT_ENCODING
    if compression == Compression.GZIP.value:
        return cast(IO, gzip.open(path, mode, encoding=encoding))
    elif compression == Compression.ZSTD.value:
        #
        # zstandard does not support some operations for binary data (e.g. readline)
        # https://github.com/indygreg/python-zstandard/issues/136
        #
        reader = zstandard.open(path, mode, encoding=encoding)  # type: ignore
        if 'b' in mode:
            reader = io.BufferedReader(reader) if 'r' in mode else io.BufferedWriter(reader)
        return cast(IO, reader)
    else:
        raise ValueError("Unsupported compression format: %r" % compression)
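A small usage sketch for _open_compressed_file above, assuming the surrounding module's Compression enum and a hypothetical file name:

# Read the first line of a zstd-compressed text file transparently.
with _open_compressed_file("corpus.jsonl.zst", Compression.ZSTD.value, mode="rt") as fh:
    first_line = fh.readline()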
Example #7
    def zstdopen(cls,
                 name,
                 mode="r",
                 fileobj=None,
                 cctx=None,
                 dctx=None,
                 **kwargs):  # type: ignore
        """Open zstd compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if mode not in ("r"):
            raise ValueError("mode must be 'r'")

        try:
            zobj = zstandard.open(fileobj or name,
                                  mode + "b",
                                  cctx=cctx,
                                  dctx=dctx)
            with zobj:
                data = zobj.read()
        except (zstandard.ZstdError, EOFError) as e:
            raise tarfile.ReadError("not a zstd file") from e

        fileobj = io.BytesIO(data)
        t = cls.taropen(name, mode, fileobj, **kwargs)
        t._extfileobj = False
        return t
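A sketch (an assumption, not taken from the original project) of how a classmethod like zstdopen is commonly wired into the tarfile module: subclass TarFile and extend OPEN_METH so mode strings such as "r:zst" dispatch to it.

import io
import tarfile
import zstandard


class ZstdTarFile(tarfile.TarFile):
    # Extend the dispatch table so ZstdTarFile.open(..., mode="r:zst") finds us.
    OPEN_METH = dict(tarfile.TarFile.OPEN_METH, zst="zstdopen")

    @classmethod
    def zstdopen(cls, name, mode="r", fileobj=None, **kwargs):
        if mode != "r":
            raise ValueError("mode must be 'r'")
        try:
            # Decompress the whole archive into memory, as in the example above.
            with zstandard.open(fileobj or name, "rb") as zobj:
                data = zobj.read()
        except (zstandard.ZstdError, EOFError) as e:
            raise tarfile.ReadError("not a zstd file") from e
        t = cls.taropen(name, mode, io.BytesIO(data), **kwargs)
        t._extfileobj = False
        return t


# Usage sketch: tar = ZstdTarFile.open("archive.tar.zst", mode="r:zst")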
Example #8
def zstd(stream):
    """
    Read zstandard compressed files
    """
    import zstandard  # type:ignore

    with zstandard.open(stream, "rb") as file:  # type:ignore
        yield from file.read().split(b"\n")[:-1]
Example #9
    def _generate_examples(self, files):
        """Yield examples as (key, example) tuples."""
        key = 0
        if isinstance(files, list):
            import zstandard as zstd

            for path in files:
                with zstd.open(open(path, "rb"), "rt", encoding="utf-8") as f:
                    for row in f:
                        data = json.loads(row)
                        yield key, data
                        key += 1
        else:
            for subset in files:
                if subset in {
                        "enron_emails", "europarl", "free_law", "nih_exporter",
                        "pubmed", "ubuntu_irc"
                }:
                    import zstandard as zstd

                    with zstd.open(open(files[subset], "rb"),
                                   "rt",
                                   encoding="utf-8") as f:
                        for row in f:
                            data = json.loads(row)
                            yield key, data
                            key += 1
                elif subset in {"hacker_news", "pubmed_central"}:
                    for path, file in files[subset]:
                        id_ = path.split("/")[-1].split(".")[0]
                        meta = {"id": id_}
                        text = file.read().decode("utf-8")
                        yield key, {
                            "text": text,
                            "meta": meta,
                        }
                        key += 1
                elif subset == "uspto":
                    import zstandard as zstd

                    for path, file in files[subset]:
                        with zstd.open(file, "rt", encoding="utf-8") as f:
                            for row in f:
                                data = json.loads(row)
                                yield key, data
                                key += 1
Example #10
def zstd_file(tmp_path_factory):
    if config.ZSTANDARD_AVAILABLE:
        import zstandard as zstd

        path = tmp_path_factory.mktemp("data") / "file.txt.zst"
        data = bytes(FILE_CONTENT, "utf-8")
        with zstd.open(path, "wb") as f:
            f.write(data)
        return path
Example #11
def wiktionary_wordlist(path: Path) -> Wordlist:
    wordlist = Wordlist()
    with zst.open(path, mode="r", newline="\n") as f:
        for line in f:
            data = json.loads(line.strip())
            wordlist.add_word(data["word"], score=1000)
            for form in data.get("forms", []):
                wordlist.add_word(form["form"], score=100)
    return wordlist
Example #12
    def test_write_binary_filename(self):
        with tempfile.TemporaryDirectory() as td:
            p = os.path.join(td, "testfile")

            fh = zstd.open(p, "wb")
            fh.write(b"foo" * 1024)
            self.assertFalse(fh.closed)

            fh.close()
            self.assertTrue(fh.closed)
Example #13
def read_df(path, fmt="csv", reader_args=[], reader_options={}, open_kw={}):
    """Read DataFrame.

    Args:
        path (str): The path to read from. Can be anything, which `smart_open` supports, like `s3://bucket/file`.
            Compression type is inferred from the file name extension (`.zst`/`.zstd`, `.gz`, `.bz2`).

    Kwargs:
        fmt (str): The format to read. Should work with most of Pandas `read_*` methods.
        reader_args (list): Argument list for the Pandas `read_$fmt` method.
        reader_options (dict): Keyword arguments for the Pandas `read_$fmt` method.
        open_kw (dict): Keyword arguments for `smart_open`.
    Returns:
        The read Pandas DataFrame.
    """
    reader_defaults = {
        "csv": {
            "encoding": "UTF_8"
        },
        "json": {
            "orient": "records",
            "lines": True
        }
    }
    if not reader_options:
        reader_options = reader_defaults.get(fmt, {})
    pd_reader = getattr(pd, "read_{}".format(fmt))

    # pandas could read from S3 and even open some compressed formats, but
    # for testing (localstack) and consistency, we handle all cases the same:
    # we open the path with smart_open and stack a decompressor onto it
    with open(path, "rb", compression="disable", **open_kw) as _r:
        if path.endswith(".zstd") or path.endswith(".zst"):
            with zstandard.open(_r) as zs:
                # these readers try to seek, which is not supported
                # by the decompressor, so open a temporary file, uncompress data
                # to it and use that for reading with pandas
                if fmt in ["parquet", "feather"]:
                    with tempfile.NamedTemporaryFile(delete=False) as tmpfile:
                        shutil.copyfileobj(zs, tmpfile)
                        tmpfile.flush()
                        tmpfile.seek(0)
                        return pd_reader(tmpfile, *reader_args,
                                         **reader_options)
                else:
                    return pd_reader(zs, *reader_args, **reader_options)
        elif path.endswith(".gz"):
            with gzip.GzipFile(fileobj=_r) as gz:
                return pd_reader(gz, *reader_args, **reader_options)
        elif path.endswith(".bz2"):
            with bz2.open(_r) as bz:
                return pd_reader(bz, *reader_args, **reader_options)
        else:
            return pd_reader(_r, *reader_args, **reader_options)
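A hypothetical call to read_df above (the URI is illustrative, and `open` is assumed to be smart_open's open, as the inline comments indicate):

df = read_df("s3://my-bucket/events.jsonl.zst", fmt="json")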
Example #14
    def test_write_binary_fileobj(self):
        buffer = io.BytesIO()

        fh = zstd.open(buffer, "wb")
        fh.write(b"foo" * 1024)
        self.assertFalse(fh.closed)
        self.assertFalse(buffer.closed)

        fh.close()
        self.assertTrue(fh.closed)
        self.assertFalse(buffer.closed)
Example #15
    def test_read_binary_fileobj(self):
        cctx = zstd.ZstdCompressor()
        buffer = io.BytesIO(cctx.compress(b"foo" * 1024))

        fh = zstd.open(buffer, "rb")

        self.assertEqual(fh.read(6), b"foofoo")
        self.assertFalse(fh.closed)
        self.assertFalse(buffer.closed)

        fh.close()
        self.assertTrue(fh.closed)
        self.assertFalse(buffer.closed)

        buffer = io.BytesIO(cctx.compress(b"foo" * 1024))

        with zstd.open(buffer, "rb", closefd=True) as fh:
            self.assertEqual(fh.read(), b"foo" * 1024)

        self.assertTrue(fh.closed)
        self.assertTrue(buffer.closed)
Example #16
    def test_read_text_fileobj(self):
        cctx = zstd.ZstdCompressor()
        buffer = io.BytesIO(cctx.compress(b"foo\n" * 1024))

        fh = zstd.open(buffer, "r")
        self.assertIsInstance(fh, io.TextIOWrapper)

        self.assertEqual(fh.readline(), "foo\n")

        fh.close()
        self.assertTrue(fh.closed)
        self.assertFalse(buffer.closed)
Example #17
    def test_read_binary_filename(self):
        with tempfile.TemporaryDirectory() as td:
            p = os.path.join(td, "testfile")
            with open(p, "wb") as fh:
                cctx = zstd.ZstdCompressor()
                fh.write(cctx.compress(b"foo" * 1024))

            fh = zstd.open(p, "rb")

            self.assertEqual(fh.read(6), b"foofoo")
            self.assertEqual(len(fh.read()), 1024 * 3 - 6)
            self.assertFalse(fh.closed)

            fh.close()
            self.assertTrue(fh.closed)
Example #18
    def test_read_text_filename(self):
        with tempfile.TemporaryDirectory() as td:
            p = os.path.join(td, "testfile")
            cctx = zstd.ZstdCompressor()
            with open(p, "wb") as fh:
                fh.write(cctx.compress(b"foo\n" * 1024))

            fh = zstd.open(p, "r")

            self.assertEqual(fh.read(4), "foo\n")
            self.assertEqual(fh.readline(), "foo\n")
            self.assertFalse(fh.closed)

            fh.close()
            self.assertTrue(fh.closed)
Example #19
    def __enter__(self):
        blendfile = open(self._filepath, "rb")
        blendfile_base = None
        head = blendfile.read(4)
        blendfile.seek(0)
        if head[0:2] == b'\x1f\x8b':  # GZIP magic.
            import gzip
            blendfile_base = blendfile
            blendfile = gzip.open(blendfile, "rb")
        elif head[0:4] == b'\x28\xb5\x2f\xfd':  # Z-standard magic.
            import zstandard
            blendfile_base = blendfile
            blendfile = zstandard.open(blendfile, "rb")

        self._blendfile_base = blendfile_base
        self._blendfile = blendfile

        return self._blendfile
Example #20
def save(filename_base: str, figure: matplotlib.pyplot.figure):
    """Save a matplotlib figure as png and a pickle it to a mplf file"""
    figure.savefig(filename_base + '.png')
    with zstandard.open(filename_base + '.mplf', 'wb') as outfile:
        pickle.dump(figure, outfile)
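A counterpart sketch (an assumption, mirroring Example #26 below) for reloading a figure written by save():

import pickle
import zstandard

def load(filename_base: str):
    """Load a matplotlib figure pickled by save()."""
    with zstandard.open(filename_base + '.mplf', 'rb') as infile:
        return pickle.load(infile)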
Example #21
    def test_write_text_fileobj(self):
        buffer = io.BytesIO()

        fh = zstd.open(buffer, "w")
        fh.write("foo")
        fh.write("foo")
Example #22
def write_file_zstandard(filename="", content=[]):
    with zstandard.open(filename, "w") as f:
        for row in content:
            f.write(row + "/n")
    return []
Example #23
def write_df(df,
             path,
             copy_paths=[],
             fmt="csv",
             compress_level=6,
             chunksize=None,
             writer_args=[],
             writer_options={},
             zstd_options={"threads": -1},
             open_kw={}):
    """
    Write Pandas DataFrame.

    Can write to local files and to S3 paths in any format, supported by the
    installed pandas version. Writer-specific arguments can be given in
    writer_args and writer_options.
    If the path parameter starts with s3://, it will try to do an S3 write,
    otherwise opens a local file with that path.

    Additional output files can be specified in `copy_paths` parameter, as
    a list of either local, or `s3://...` paths. The same output will be written
    there as to `path` in parallel to reduce overhead.

    Args:
        df (pandas.DataFrame): The DataFrame to write.
        path (str): The path to write to. Can be anything, which `smart_open` supports, like `s3://bucket/file`.

    Kwargs:
        copy_paths (list[str]): Place a copy to these paths as well. Writes in parallel.
        fmt (str): The format to write. Should work with most of Pandas `write_*` methods.
        compress_level (int): Compress level, passed through to the compressor. gzip/bzip2: 1-9, zstd: 1-22.
        chunksize (int): Break DataFrame into `chunksize` sized chunks and write those. 
        writer_args (list): Argument list for the Pandas `write_$fmt` method.
        writer_options (dict): Keyword arguments for the Pandas `write_$fmt` method.
        zstd_options (dict): Keyword arguments for the `zstd` compressor.
        open_kw (dict): Keyword arguments for `smart_open`.
    Returns:
        None
    """
    if compress_level is not None:
        zstd_options["level"] = compress_level

    writer_defaults = {
        "csv": {
            "index": False,
            "encoding": "UTF_8"
        },
        "json": {
            "orient": "records",
            "lines": True,
            "force_ascii": False
        }
    }
    if not writer_options and fmt in writer_defaults:
        writer_options = writer_defaults[fmt]

    filename = os.path.basename(path)
    _files = []
    # support S3 and local writes as well
    for _path in copy_paths + [path]:
        _files.append(open(_path, "wb", compression="disable", **open_kw))

    # depending on the compression status and the mode of the file object,
    # we may stack up to three file objects on top of each other. To track this,
    # we append them in order to fhs, which we'll use to flush/close in the
    # opposite order.
    fhs = [Writer(_files)]

    # if compression is enabled, we open the compression stream on the
    # top of the parallel_write object stored in fhs array's first element
    # and appending the new object to its tail.
    if filename.endswith(".gz"):
        fhs.append(
            gzip.GzipFile(filename,
                          mode="wb",
                          compresslevel=compress_level,
                          fileobj=fhs[0]))
    if filename.endswith(".bz2"):
        fhs.append(bz2.open(fhs[0], mode="wb", compresslevel=compress_level))
    if filename.endswith(".zstd") or filename.endswith(".zst"):
        fhs.append(zstandard.open(fhs[0], mode="wb", closefd=False))
    writer = getattr(df, "to_{}".format(fmt))

    # for writing, we always use the last element in the stack, fhs[-1]
    if fmt in []:
        # add any future pandas writers here, which doesn't implement
        # writing to a (compressed) stream, for eg. because it seeks
        with tempfile.NamedTemporaryFile() as tmpfile:
            writer(tmpfile.name, *writer_args, **writer_options)
            tmpfile.seek(0)
            shutil.copyfileobj(tmpfile, fhs[-1])
    elif fmt == "csv":
        # CSV natively supports chunked writes
        _writer_wrapper(writer, fhs, writer_args,
                        dict(writer_options, chunksize=chunksize))
    elif chunksize and fmt == "json" and writer_options.get(
            "orient") == "records" and writer_options.get("lines"):
        # calculate the number of desired parts
        split_parts = int(max(1, len(df) / chunksize))
        # split the DF into parts
        for _df in np.array_split(df, split_parts):
            writer = getattr(_df, "to_{}".format(fmt))
            _writer_wrapper(writer, fhs, writer_args, writer_options)
            # we have to write a newline after every round, so the next
            # round doesn't start on the same line
            try:
                # Try to adapt to the required mode by catching TypeError
                # Seems to be more reliable than trying to figure out the
                # binary/text type.
                fhs[-1].write(b"\n")
            except TypeError:
                fhs[-1].write("\n")
    else:
        # in all other cases we're just calling the writer
        _writer_wrapper(writer, fhs, writer_args, writer_options)
    # flush/close all file objects in reverse order
    for f in reversed(fhs):
        f.close()
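A hypothetical call to write_df above; both paths are illustrative, and `open` is assumed to be smart_open's open:

import pandas as pd

df = pd.DataFrame({"user": ["a", "b"], "clicks": [3, 7]})
# Write zstd-compressed JSON lines locally and mirror the same bytes to S3.
write_df(df,
         "out/events.jsonl.zst",
         copy_paths=["s3://my-bucket/events.jsonl.zst"],
         fmt="json")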
Example #24
def zstd_path(tmp_path_factory):
    path = tmp_path_factory.mktemp("data") / "file.zstd"
    data = bytes(FILE_CONTENT, "utf-8")
    with zstd.open(path, "wb") as f:
        f.write(data)
    return path

Example #25
if __name__ == '__main__':
    for module in ('zstandard', 'indexed_zstd', 'ratarmountcore'):
        if hasattr(sys.modules[module], '__version__'):
            print(module, "version:", getattr(sys.modules[module], '__version__'))
    print()

    filename = sys.argv[1]
    if filename.endswith('.xz'):
        filename = filename[:-3]
    elif filename.endswith('.zst'):
        filename = filename[:-4]

    if os.path.isfile(filename + '.xz'):
        compareReading(xz.open(filename + '.xz', 'rb'), ParallelXZReader(filename + '.xz', os.cpu_count()))
        benchmarkReading(xz.open(filename + '.xz', 'rb'))
        benchmarkReading(lzma.open(filename + '.xz', 'rb'))
        benchmarkReading(ParallelXZReader(filename + '.xz', os.cpu_count()))

    print()

    if os.path.isfile(filename + '.zst'):
        #simpleParallelZstdReading(filename + '.zst')
        #testZstdSeeking(filename + '.zst')

        compareReading(zstandard.open(filename + '.zst', 'rb'), ParallelZstdReader(filename + '.zst', os.cpu_count()))
        benchmarkReading(zstandard.open(filename + '.zst', 'rb'))
        benchmarkReading(indexed_zstd.IndexedZstdFile(filename + '.zst'))
        benchmarkReading(ParallelZstdReader(filename + '.zst', os.cpu_count()))
Example #26
def read_zstpkl(filename: pathlib.Path):
    with zstandard.open(filename, 'rb') as f:
        return pickle.load(f)
Example #27
def read_file_zstandard(filename=""):
    with zstandard.open(filename, "r") as f:  # type: ignore
        yield from f