def _generate_examples(self, files):
    """Yield examples as (key, example) tuples."""
    key = 0
    if isinstance(files, list):
        import zstandard as zstd

        for path in files:
            with zstd.open(open(path, "rb"), "rt", encoding="utf-8") as f:
                for row in f:
                    data = json.loads(row)
                    yield key, data
                    key += 1
    else:
        for subset in files:
            if subset == "free_law":
                import zstandard as zstd

                with zstd.open(open(files[subset], "rb"), "rt", encoding="utf-8") as f:
                    for row in f:
                        data = json.loads(row)
                        yield key, data
                        key += 1
            elif subset == "pubmed_central":
                for path, file in files[subset]:
                    id_ = path.split("/")[-1].split(".")[0]
                    text = file.read().decode("utf-8")
                    yield key, {
                        "id": id_,
                        "text": text,
                    }
                    key += 1
def extract_word_frequency(json_file: Path) -> None:
    title_counter: Counter[str] = Counter()
    body_counter: Counter[str] = Counter()
    # Iterate over the handle opened by the with-statement rather than
    # opening the same file a second time.
    with zst.open(json_file, mode="r", newline="\n") as f:
        for i, line in enumerate(f):
            if i % 100000 == 0:
                print(f"> {i} {json_file} {len(title_counter)} {len(body_counter)}")
            title, score, body = json.loads(line)
            if title is None or body is None:
                continue
            title_counter.update(strip_word(w) for w in title.split())
            body_counter.update(strip_word(w) for w in strip_body(body).split())
    print(f"> done {json_file} {len(title_counter)} {len(body_counter)}")
def _repack(
    fin: IO[bytes],
    fout: IO[bytes],
    index_fout: IO[bytes],
    chunk_size: int,
    extractor: Callable,
    output_compression: str = Compression.GZIP.value,
) -> None:
    start_offset, end_offset = 0, 0
    compressed_chunk = None
    compression = _determine_compression_from_header(fin)
    assert compression in SUPPORTED_COMPRESSIONS
    assert output_compression in SUPPORTED_COMPRESSIONS
    with _open_compressed_file(fin, compression, mode='rb') as fin:
        for batch in _batch_iterator(fin, decode_lines=False, batch_size=chunk_size):
            keys = []
            line_indexes = []
            chunk = io.BytesIO()
            compressed_chunk = _open_compressed_file(chunk, output_compression, mode='wb')
            line_start, line_end = 0, 0
            for line in batch:
                key = extractor(line)
                keys.append(key)
                line_start = line_end
                line_end = line_start + len(line)
                line_indexes.append('|%s|%s' % (line_start, line_end - line_start))
                compressed_chunk.write(line)
            compressed_chunk.close()
            fout.write(chunk.getvalue())
            fout.flush()
            start_offset = end_offset
            end_offset = start_offset + chunk.getbuffer().nbytes
            for i, key in enumerate(keys):
                index = '%s|%s|%s' % (key, start_offset, end_offset - start_offset)
                index += line_indexes[i]
                index_fout.write(index.encode(_TEXT_ENCODING) + _LINE_TERMINATOR)
                index_fout.flush()

    if compressed_chunk is None:
        #
        # The input file contained no data. We must write an empty compressed
        # chunk to make sure the output file is still readable in the chosen
        # output compression format. Use a separate buffer instead of
        # rebinding fout, and close the compressor before reading the buffer.
        #
        empty_chunk = io.BytesIO()
        _open_compressed_file(empty_chunk, output_compression, mode='wb').close()
        fout.write(empty_chunk.getvalue())
        fout.flush()
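# _batch_iterator is referenced by _repack() above but not shown in the
# source. A minimal sketch under the assumption that it yields lists of raw
# lines of at most batch_size; the real implementation may differ.
def _batch_iterator(fin, decode_lines=False, batch_size=1000):
    batch = []
    for line in fin:
        batch.append(line.decode(_TEXT_ENCODING) if decode_lines else line)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch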
def cached(func: Callable[[], Any], filename: pathlib.Path, rerun: Optional[bool] = False):
    global RERUN_ALL_CACHED
    ext = '.pkl.zstd'
    filename = filename.with_suffix(ext)
    if not filename.exists() or rerun or RERUN_ALL_CACHED:
        result = func()
        if result is None:
            raise ValueError('The function you passed did not return anything')
        with zstandard.open(filename, 'wb') as outfile:
            pickle.dump(result, outfile)
    else:
        with zstandard.open(filename, 'rb') as infile:
            result = pickle.load(infile)
    return result
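# Hypothetical usage of the cached() helper above; compute_stats and the
# cache path are illustrative, and RERUN_ALL_CACHED is assumed module state.
import pathlib

def compute_stats():
    return {"mean": 0.5, "count": 10}

stats = cached(compute_stats, pathlib.Path("stats"))  # first call: runs and pickles
stats = cached(compute_stats, pathlib.Path("stats"))  # second call: loads the cache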
def test_write_text_filename(self):
    with tempfile.TemporaryDirectory() as td:
        p = os.path.join(td, "testfile")
        fh = zstd.open(p, "w")
        self.assertIsInstance(fh, io.TextIOWrapper)
        fh.write("foo\n")
        fh.write("bar\n")
        fh.close()
        self.assertTrue(fh.closed)

        with zstd.open(p, "r") as fh:
            self.assertEqual(fh.read(), "foo\nbar\n")
def _open_compressed_file(
    path: Union[str, IO],
    compression: Optional[str],
    mode: str = 'r',
) -> IO:
    """Open the specified file for reading/writing.

    Transparently opens compressed (``.gz``, ``.zst``) files.
    """
    if 'b' not in mode and 't' not in mode:
        mode += 't'
    encoding = None if 'b' in mode else _TEXT_ENCODING
    if compression == Compression.GZIP.value:
        return cast(IO, gzip.open(path, mode, encoding=encoding))
    elif compression == Compression.ZSTD.value:
        #
        # zstandard does not support some operations for binary data (e.g. readline)
        # https://github.com/indygreg/python-zstandard/issues/136
        #
        reader = zstandard.open(path, mode, encoding=encoding)  # type: ignore
        if 'b' in mode:
            reader = io.BufferedReader(reader) if 'r' in mode else io.BufferedWriter(reader)
        return cast(IO, reader)
    else:
        raise ValueError("Unsupported compression format: %r" % compression)
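# Hypothetical usage of _open_compressed_file() above; the path is
# illustrative, and Compression/_TEXT_ENCODING are module constants the
# helper already assumes.
with _open_compressed_file("data.jsonl.zst", Compression.ZSTD.value, mode="r") as f:
    for line in f:
        record = json.loads(line)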
def zstdopen(cls, name, mode="r", fileobj=None, cctx=None, dctx=None, **kwargs):  # type: ignore
    """Open zstd compressed tar archive name for reading or writing.

    Appending is not allowed.
    """
    # The original `mode not in ("r")` was a string containment check;
    # a direct comparison expresses the intent.
    if mode != "r":
        raise ValueError("mode must be 'r'")
    try:
        zobj = zstandard.open(fileobj or name, mode + "b", cctx=cctx, dctx=dctx)
        with zobj:
            data = zobj.read()
    except (zstandard.ZstdError, EOFError) as e:
        raise tarfile.ReadError("not a zstd file") from e

    fileobj = io.BytesIO(data)
    t = cls.taropen(name, mode, fileobj, **kwargs)
    t._extfileobj = False
    return t
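# A minimal sketch of wiring zstdopen() above into a tarfile.TarFile
# subclass; the class name and the OPEN_METH entry are assumptions, not from
# the source, and zstdopen is assumed to be defined at module level.
import tarfile

class ZstdTarFile(tarfile.TarFile):
    OPEN_METH = {"zstd": "zstdopen", **tarfile.TarFile.OPEN_METH}
    zstdopen = classmethod(zstdopen)

# Example (illustrative path):
#     with ZstdTarFile.zstdopen("archive.tar.zst") as tf:
#         print(tf.getnames())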
def zstd(stream):
    """Read zstandard compressed files"""
    import zstandard  # type: ignore

    with zstandard.open(stream, "rb") as file:  # type: ignore
        yield from file.read().split(b"\n")[:-1]
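# Hypothetical usage of the zstd() line generator above; data.jsonl.zst is an
# illustrative path, not from the original source.
with open("data.jsonl.zst", "rb") as stream:
    for raw_line in zstd(stream):
        print(raw_line.decode("utf-8"))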
def _generate_examples(self, files):
    """Yield examples as (key, example) tuples."""
    key = 0
    if isinstance(files, list):
        import zstandard as zstd

        for path in files:
            with zstd.open(open(path, "rb"), "rt", encoding="utf-8") as f:
                for row in f:
                    data = json.loads(row)
                    yield key, data
                    key += 1
    else:
        for subset in files:
            if subset in {
                "enron_emails",
                "europarl",
                "free_law",
                "nih_exporter",
                "pubmed",
                "ubuntu_irc",
            }:
                import zstandard as zstd

                with zstd.open(open(files[subset], "rb"), "rt", encoding="utf-8") as f:
                    for row in f:
                        data = json.loads(row)
                        yield key, data
                        key += 1
            elif subset in {"hacker_news", "pubmed_central"}:
                for path, file in files[subset]:
                    id_ = path.split("/")[-1].split(".")[0]
                    meta = {"id": id_}
                    text = file.read().decode("utf-8")
                    yield key, {
                        "text": text,
                        "meta": meta,
                    }
                    key += 1
            elif subset == "uspto":
                import zstandard as zstd

                for path, file in files[subset]:
                    with zstd.open(file, "rt", encoding="utf-8") as f:
                        for row in f:
                            data = json.loads(row)
                            yield key, data
                            key += 1
def zstd_file(tmp_path_factory):
    if config.ZSTANDARD_AVAILABLE:
        import zstandard as zstd

        path = tmp_path_factory.mktemp("data") / "file.txt.zst"
        data = bytes(FILE_CONTENT, "utf-8")
        with zstd.open(path, "wb") as f:
            f.write(data)
        return path
def wiktionary_wordlist(path: Path) -> Wordlist:
    wordlist = Wordlist()
    with zst.open(path, mode="r", newline="\n") as f:
        for line in f:
            data = json.loads(line.strip())
            wordlist.add_word(data["word"], score=1000)
            for form in data.get("forms", []):
                wordlist.add_word(form["form"], score=100)
    return wordlist
def test_write_binary_filename(self):
    with tempfile.TemporaryDirectory() as td:
        p = os.path.join(td, "testfile")
        fh = zstd.open(p, "wb")
        fh.write(b"foo" * 1024)
        self.assertFalse(fh.closed)
        fh.close()
        self.assertTrue(fh.closed)
def read_df(path, fmt="csv", reader_args=[], reader_options={}, open_kw={}):
    """Read DataFrame.

    Args:
        path (str): The path to read from. Can be anything, which
            `smart_open` supports, like `s3://bucket/file`. Compression type
            is inferred.

    Kwargs:
        fmt (str): The format to read. Should work with most of Pandas
            `read_*` methods.
        reader_args (list): Argument list for the Pandas `read_$fmt` method.
        reader_options (dict): Keyword arguments for the Pandas `read_$fmt`
            method.
        open_kw (dict): Keyword arguments for `smart_open`.

    Returns:
        The read Pandas DataFrame.
    """
    reader_defaults = {
        "csv": {"encoding": "UTF_8"},
        "json": {"orient": "records", "lines": True},
    }
    if not reader_options:
        reader_options = reader_defaults.get(fmt, {})
    pd_reader = getattr(pd, "read_{}".format(fmt))

    # pandas could read from S3 and even open some compressed formats, but
    # for testing (localstack) and consistency, we handle all cases the same:
    # we open the path with smart_open and stack a decompressor onto it
    with open(path, "rb", compression="disable", **open_kw) as _r:
        if path.endswith(".zstd") or path.endswith(".zst"):
            with zstandard.open(_r) as zs:
                # these readers try to seek, which is not supported
                # by the decompressor, so open a temporary file, uncompress
                # data to it and use that for reading with pandas
                if fmt in ["parquet", "feather"]:
                    with tempfile.NamedTemporaryFile(delete=False) as tmpfile:
                        shutil.copyfileobj(zs, tmpfile)
                        tmpfile.flush()
                        tmpfile.seek(0)
                        return pd_reader(tmpfile, *reader_args, **reader_options)
                else:
                    return pd_reader(zs, *reader_args, **reader_options)
        elif path.endswith(".gz"):
            with gzip.GzipFile(fileobj=_r) as gz:
                return pd_reader(gz, *reader_args, **reader_options)
        elif path.endswith(".bz2"):
            with bz2.open(_r) as bz:
                return pd_reader(bz, *reader_args, **reader_options)
        else:
            return pd_reader(_r, *reader_args, **reader_options)
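# Hypothetical calls to read_df() above; the paths are illustrative only.
df = read_df("s3://bucket/events.json.zst", fmt="json")
df = read_df("local/data.csv.gz")  # gzip input takes the same code path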
def test_write_binary_fileobj(self):
    buffer = io.BytesIO()
    fh = zstd.open(buffer, "wb")
    fh.write(b"foo" * 1024)
    self.assertFalse(fh.closed)
    self.assertFalse(buffer.closed)
    fh.close()
    self.assertTrue(fh.closed)
    self.assertFalse(buffer.closed)
def test_read_binary_fileobj(self):
    cctx = zstd.ZstdCompressor()

    buffer = io.BytesIO(cctx.compress(b"foo" * 1024))
    fh = zstd.open(buffer, "rb")
    self.assertEqual(fh.read(6), b"foofoo")
    self.assertFalse(fh.closed)
    self.assertFalse(buffer.closed)
    fh.close()
    self.assertTrue(fh.closed)
    self.assertFalse(buffer.closed)

    buffer = io.BytesIO(cctx.compress(b"foo" * 1024))
    with zstd.open(buffer, "rb", closefd=True) as fh:
        self.assertEqual(fh.read(), b"foo" * 1024)
    self.assertTrue(fh.closed)
    self.assertTrue(buffer.closed)
def test_read_text_fileobj(self):
    cctx = zstd.ZstdCompressor()
    buffer = io.BytesIO(cctx.compress(b"foo\n" * 1024))
    fh = zstd.open(buffer, "r")
    self.assertIsInstance(fh, io.TextIOWrapper)
    self.assertEqual(fh.readline(), "foo\n")
    fh.close()
    self.assertTrue(fh.closed)
    self.assertFalse(buffer.closed)
def test_read_binary_filename(self):
    with tempfile.TemporaryDirectory() as td:
        p = os.path.join(td, "testfile")
        with open(p, "wb") as fh:
            cctx = zstd.ZstdCompressor()
            fh.write(cctx.compress(b"foo" * 1024))

        fh = zstd.open(p, "rb")
        self.assertEqual(fh.read(6), b"foofoo")
        self.assertEqual(len(fh.read()), 1024 * 3 - 6)
        self.assertFalse(fh.closed)
        fh.close()
        self.assertTrue(fh.closed)
def test_read_text_filename(self):
    with tempfile.TemporaryDirectory() as td:
        p = os.path.join(td, "testfile")
        cctx = zstd.ZstdCompressor()
        with open(p, "wb") as fh:
            fh.write(cctx.compress(b"foo\n" * 1024))

        fh = zstd.open(p, "r")
        self.assertEqual(fh.read(4), "foo\n")
        self.assertEqual(fh.readline(), "foo\n")
        self.assertFalse(fh.closed)
        fh.close()
        self.assertTrue(fh.closed)
def __enter__(self):
    blendfile = open(self._filepath, "rb")
    blendfile_base = None
    head = blendfile.read(4)
    blendfile.seek(0)
    if head[0:2] == b'\x1f\x8b':  # GZIP magic.
        import gzip
        blendfile_base = blendfile
        blendfile = gzip.open(blendfile, "rb")
    elif head[0:4] == b'\x28\xb5\x2f\xfd':  # Z-standard magic.
        import zstandard
        blendfile_base = blendfile
        blendfile = zstandard.open(blendfile, "rb")
    self._blendfile_base = blendfile_base
    self._blendfile = blendfile
    return self._blendfile
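# The matching __exit__ is not shown in the source; a minimal sketch,
# assuming the context manager only needs to close the (possibly stacked)
# file objects set up in __enter__ above:
def __exit__(self, exc_type, exc_value, traceback):
    self._blendfile.close()
    if self._blendfile_base is not None:
        # Close the raw file underneath the decompressor as well.
        self._blendfile_base.close()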
def save(filename_base: str, figure: matplotlib.pyplot.figure):
    """Save a matplotlib figure as png and pickle it to a mplf file"""
    figure.savefig(filename_base + '.png')
    with zstandard.open(filename_base + '.mplf', 'wb') as outfile:
        pickle.dump(figure, outfile)
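# The inverse operation is not shown in the source; a minimal load() sketch,
# assuming the .mplf file was produced by save() above:
def load(filename_base: str):
    """Load a pickled matplotlib figure from a zstd-compressed .mplf file."""
    with zstandard.open(filename_base + '.mplf', 'rb') as infile:
        return pickle.load(infile)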
def test_write_text_fileobj(self):
    buffer = io.BytesIO()
    fh = zstd.open(buffer, "w")
    fh.write("foo")
    fh.write("foo")
def write_file_zstandard(filename="", content=[]):
    with zstandard.open(filename, "w") as f:
        for row in content:
            # "\n" (not "/n"): terminate each row with a newline.
            f.write(row + "\n")
    return []
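# Hypothetical call to write_file_zstandard() above; the path and rows are
# illustrative only.
write_file_zstandard("rows.txt.zst", ["first line", "second line"])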
def write_df(df,
             path,
             copy_paths=[],
             fmt="csv",
             compress_level=6,
             chunksize=None,
             writer_args=[],
             writer_options={},
             zstd_options={"threads": -1},
             open_kw={}):
    """
    Write Pandas DataFrame.

    Can write to local files and to S3 paths in any format supported by the
    installed pandas version. Writer-specific arguments can be given in
    writer_args and writer_options.

    If the path parameter starts with s3://, it will try to do an S3 write,
    otherwise opens a local file with that path.

    Additional output files can be specified in the `copy_paths` parameter,
    as a list of either local, or `s3://...` paths. The same output will be
    written there as to `path` in parallel to reduce overhead.

    Args:
        df (pandas.DataFrame): The DataFrame to write.
        path (str): The path to write to. Can be anything, which
            `smart_open` supports, like `s3://bucket/file`.

    Kwargs:
        copy_paths (list[str]): Place a copy to these paths as well. Writes
            in parallel.
        fmt (str): The format to write. Should work with most of Pandas
            `to_*` methods.
        compress_level (int): Compress level, passed through to the
            compressor. gzip/bzip2: 1-9, zstd: 1-22.
        chunksize (int): Break DataFrame into `chunksize` sized chunks and
            write those.
        writer_args (list): Argument list for the Pandas `to_$fmt` method.
        writer_options (dict): Keyword arguments for the Pandas `to_$fmt`
            method.
        zstd_options (dict): Keyword arguments for the `zstd` compressor.
        open_kw (dict): Keyword arguments for `smart_open`.

    Returns:
        None
    """
    if compress_level is not None:
        zstd_options["level"] = compress_level

    writer_defaults = {
        "csv": {"index": False, "encoding": "UTF_8"},
        "json": {"orient": "records", "lines": True, "force_ascii": False},
    }
    if not writer_options and fmt in writer_defaults:
        writer_options = writer_defaults[fmt]

    filename = os.path.basename(path)

    _files = []
    # support S3 and local writes as well
    for _path in copy_paths + [path]:
        _files.append(open(_path, "wb", compression="disable", **open_kw))

    # depending on the compression status and the mode of the file object,
    # we may stack up to three file objects on top of each other. To track
    # this, we append them in order to fhs, which we'll use to flush/close in
    # the opposite order.
    fhs = [Writer(_files)]

    # if compression is enabled, we open the compression stream on the top of
    # the parallel_write object stored in fhs array's first element and
    # append the new object to its tail.
    if filename.endswith(".gz"):
        fhs.append(
            gzip.GzipFile(filename,
                          mode="wb",
                          compresslevel=compress_level,
                          fileobj=fhs[0]))
    if filename.endswith(".bz2"):
        fhs.append(bz2.open(fhs[0], mode="wb", compresslevel=compress_level))
    if filename.endswith(".zstd") or filename.endswith(".zst"):
        fhs.append(zstandard.open(fhs[0], mode="wb", closefd=False))

    writer = getattr(df, "to_{}".format(fmt))

    # for writing, we always use the last element in the stack, fhs[-1]
    if fmt in []:
        # add any future pandas writers here, which don't implement writing
        # to a (compressed) stream, e.g. because they seek
        with tempfile.NamedTemporaryFile() as tmpfile:
            writer(tmpfile.name, *writer_args, **writer_options)
            tmpfile.seek(0)
            shutil.copyfileobj(tmpfile, fhs[-1])
    elif fmt == "csv":
        # CSV natively supports chunked writes
        _writer_wrapper(writer, fhs, writer_args,
                        dict(writer_options, chunksize=chunksize))
    elif chunksize and fmt == "json" and writer_options.get(
            "orient") == "records" and writer_options.get("lines"):
        # calculate the number of desired parts
        split_parts = int(max(1, len(df) / chunksize))
        # split the DF into parts
        for _df in np.array_split(df, split_parts):
            writer = getattr(_df, "to_{}".format(fmt))
            _writer_wrapper(writer, fhs, writer_args, writer_options)
            # we have to write a newline after every round, so the next
            # round won't start on the same line
            try:
                # Try to adapt to the required mode by catching TypeError.
                # Seems to be more reliable than trying to figure out the
                # binary/text type.
                fhs[-1].write(b"\n")
            except TypeError:
                fhs[-1].write("\n")
    else:
        # in all other cases we're just calling the writer
        _writer_wrapper(writer, fhs, writer_args, writer_options)

    # flush/close all file objects in reverse order
    for f in reversed(fhs):
        f.close()
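# Hypothetical call to write_df() above; the paths are illustrative only.
write_df(df,
         "s3://bucket/out/events.json.zst",
         copy_paths=["backup/events.json.zst"],
         fmt="json",
         chunksize=100000)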
def zstd_path(tmp_path_factory):
    path = tmp_path_factory.mktemp("data") / "file.zstd"
    data = bytes(FILE_CONTENT, "utf-8")
    with zstd.open(path, "wb") as f:
        f.write(data)
    return path
if __name__ == '__main__':
    for module in ('zstandard', 'indexed_zstd', 'ratarmountcore'):
        if hasattr(sys.modules[module], '__version__'):
            print(module, "version:", getattr(sys.modules[module], '__version__'))
    print()

    filename = sys.argv[1]
    if filename.endswith('.xz'):
        filename = filename[:-3]
    elif filename.endswith('.zst'):
        filename = filename[:-4]

    if os.path.isfile(filename + '.xz'):
        compareReading(xz.open(filename + '.xz', 'rb'),
                       ParallelXZReader(filename + '.xz', os.cpu_count()))
        benchmarkReading(xz.open(filename + '.xz', 'rb'))
        benchmarkReading(lzma.open(filename + '.xz', 'rb'))
        benchmarkReading(ParallelXZReader(filename + '.xz', os.cpu_count()))
        print()

    if os.path.isfile(filename + '.zst'):
        #simpleParallelZstdReading(filename + '.zst')
        #testZstdSeeking(filename + '.zst')
        compareReading(zstandard.open(filename + '.zst', 'rb'),
                       ParallelZstdReader(filename + '.zst', os.cpu_count()))
        benchmarkReading(zstandard.open(filename + '.zst', 'rb'))
        benchmarkReading(indexed_zstd.IndexedZstdFile(filename + '.zst'))
        benchmarkReading(ParallelZstdReader(filename + '.zst', os.cpu_count()))
def read_zstpkl(filename: pathlib.Path):
    with zstandard.open(filename, 'rb') as f:
        return pickle.load(f)
def read_file_zstandard(filename=""):
    with zstandard.open(filename, "r") as f:  # type: ignore
        yield from f
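# Hypothetical usage of the readers above; the file names are illustrative
# only.
import pathlib

for line in read_file_zstandard("rows.txt.zst"):
    print(line.rstrip("\n"))

obj = read_zstpkl(pathlib.Path("result.pkl.zstd"))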