def test_ref_loops():
    # Had a bunch of trouble eliminating reference loops in the ZS object.
    # Don't use 'with' statement here b/c that keeps another ref which just
    # confuses things.
    z = ZS(test_data_path("letters-none.zs"))
    try:
        # 1 for 'z', one for the temporary passed to sys.getrefcount
        print(sys.getrefcount(z))
        assert sys.getrefcount(z) == 2
        list(z)
        assert sys.getrefcount(z) == 2
    finally:
        z.close()
def test_http_zs():
    with web_server(test_data_path()) as root_url:
        codec = "deflate"
        url = "%s/letters-%s.zs" % (root_url, codec)
        for parallelism in [0, 2]:
            with ZS(url=url, parallelism=parallelism) as z:
                check_letters_zs(z, codec)
def test_block_exec():
    # This function is tricky to test in a multiprocessing world, because we
    # need some way to communicate back from the subprocesses that the
    # execution actually happened... instead we just test it in serial
    # mode. (Fortunately it is a super-trivial function.)
    z = ZS(test_data_path("letters-none.zs"), parallelism=0)

    # b/c we're in serial mode, the fn doesn't need to be pickleable
    class CountBlocks(object):
        def __init__(self):
            self.count = 0

        def __call__(self, records):
            self.count += 1

    count_blocks = CountBlocks()
    z.block_exec(count_blocks)
    assert count_blocks.count > 1
    assert count_blocks.count == len(list(z.block_map(identity)))
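# (Added sketch, not part of the original test suite.)  The comment above notes
# that serial mode lets us get away with a non-picklable callable; in parallel
# mode the callable must be picklable, so a module-level function works.  This
# is a hedged illustration assuming block_map() hands each block's list of
# records to the callable, as the serial test above suggests.
def _count_records_in_block(records):
    return len(records)


def test_block_map_parallel_smoke():
    with ZS(test_data_path("letters-none.zs"), parallelism=2) as z:
        per_block = list(z.block_map(_count_records_in_block))
        # every record should be seen exactly once across all blocks
        assert sum(per_block) == len(list(z))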
def open_zs(opts, **kwargs):
    zs_path_or_url = opts["<zs_file>"]
    if zs_path_or_url.startswith("http"):
        kwargs["url"] = zs_path_or_url
    else:
        kwargs["path"] = zs_path_or_url
    if "__j__" in opts:
        kwargs["parallelism"] = opts["__j__"]
    return ZS(**kwargs)
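# (Added usage sketch; not part of the original module.)  open_zs expects a
# docopt-style options dict: the positional <zs_file> argument may be either a
# local path or an http:// URL, and it is routed to the matching ZS()
# constructor keyword.  The options dict and file name below are hypothetical.
def _example_open_zs_usage():
    opts = {"<zs_file>": "letters-none.zs"}
    with open_zs(opts) as z:
        return sum(1 for _ in z)  # e.g. count the records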
def test_zs_close():
    z = ZS(test_data_path("letters-none.zs"))
    z.close()
    for call in [[list, z.search()],
                 [list, z.block_map(_check_raise_helper, AssertionError)],
                 [list, z],
                 [z.dump, BytesIO()],
                 [z.validate],
                 ]:
        print(repr(call))
        assert_raises(ZSError, *call)
    # But calling .close() twice is fine.
    z.close()
    # smoke test for __del__ method
    ZS(test_data_path("letters-none.zs"))
def test_info():
    with simple_zs() as p:
        out = run(["info", p])
        info = json.loads(out.stdout.decode("ascii"))
        with ZS(p) as z:
            assert info["codec"] == z.codec
            assert binascii.unhexlify(info["data_sha256"]) == z.data_sha256
            assert info["metadata"] == z.metadata
        just_metadata = json.loads(
            run(["info", p, "--metadata-only"]).stdout.decode("ascii"))
        assert info["metadata"] == just_metadata
def test_big_headers():
    from zs.reader import _lower_header_size_guess
    with _lower_header_size_guess():
        z = ZS(test_data_path("letters-none.zs"))
        assert z.codec == "none"
        assert z.data_sha256 == letters_sha256
        assert z.metadata == {
            u"test-data": u"letters",
            u"build-info": {
                u"user": u"test-user",
                u"host": u"test-host",
                u"time": u"2000-01-01T00:00:00.000000Z",
                u"version": u"zs test",
            },
        }
        assert list(z) == letters_records
def ok_zs(p):
    z = ZS(p)
    z.validate()
    return z
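# (Added note, not from the original tests.)  ok_zs is a small helper: it
# opens a ZS file, runs the full validate() pass, and returns the still-open
# reader.  A hypothetical usage, reusing names that appear elsewhere in these
# tests:
#
#     with ok_zs(test_data_path("letters-none.zs")) as z:
#         assert list(z) == letters_records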
def test_make():
    from .test_writer import records as big_records, temp_zs_path

    with simple_zs(big_records) as p_in:
        for format_opt in ["--terminator=\\n",
                           "--terminator=\\x00",
                           "--length-prefixed=uleb128",
                           "--length-prefixed=u64le",
                           ]:
            input = run(["dump", p_in, format_opt]).stdout
            with temp_zs_path() as p_out:
                run(["make", format_opt, "{}", "-", p_out], input=input)
                with ZS(p_out) as z:
                    z.validate()
                    assert list(z) == big_records

    big_input = b"\n".join(big_records + [b""])

    # smoke test -j
    with temp_zs_path() as p_out:
        run(["make", "{}", "-", p_out, "-j", "3"], input=NEWLINE_RECORDS)

    # --no-spinner produces less chatter
    with temp_zs_path() as p_out:
        r1 = run(["make", "{}", "-", p_out], input=big_input)
    with temp_zs_path() as p_out:
        r2 = run(["make", "{}", "-", p_out, "--no-spinner"], input=big_input)
    assert len(r2.stdout) < len(r1.stdout)

    # codecs and compress level
    # we need some non-trivial input (so the compression algorithms have
    # some work to do), that's large enough for things like window
    # sizes to make a difference.
    r = random.Random(0)
    scrambled_letters = "".join(r.sample("abcdefghijklmnopqrstuvwxyz", 26))
    scrambled_letters = scrambled_letters.encode("ascii")
    pieces = []
    for i in range(200000):
        low = r.randrange(25)
        high = r.randrange(low, 26)
        pieces.append(scrambled_letters[low:high])
    # put a really big piece in to make long-distance memory matter more
    pieces.append(b"m" * 2 ** 18)
    pieces.sort()
    pieces.append(b"")
    bigger_input = b"\n".join(pieces)
    sizes = {}
    for settings in ["--codec=none",
                     "--codec=deflate",
                     "--codec=deflate --compress-level 1",
                     "--codec=lzma",
                     "--codec=lzma --compress-level 0e",
                     "--codec=lzma --compress-level 0",
                     "--codec=lzma --compress-level 1e",
                     "--codec=lzma --compress-level 1",
                     ]:
        with temp_zs_path() as p_out:
            run(["make", "{}", "-", p_out,
                 # bigger than the lzma -z 0 blocksize of 256k
                 "--approx-block-size", "400000"]
                + settings.split(),
                input=bigger_input)
            sizes[settings] = os.stat(p_out).st_size
    assert (sizes["--codec=lzma --compress-level 0e"]
            == sizes["--codec=lzma"])
    for big, small in [
            ("none", "deflate"),
            ("deflate --compress-level 1", "deflate"),
            ("lzma --compress-level 0", "lzma --compress-level 1"),
            ("lzma --compress-level 0e", "lzma --compress-level 1e"),
            ("lzma --compress-level 0", "lzma --compress-level 0e"),
            ("lzma --compress-level 1", "lzma --compress-level 1e"),
            ]:
        assert sizes["--codec=" + big] > sizes["--codec=" + small]

    # metadata and no-default-metadata
    for no_default in [True, False]:
        with temp_zs_path() as p_out:
            args = []
            if no_default:
                args.append("--no-default-metadata")
            run(["make", "{\"foo\": 1}", "-", p_out] + args,
                input=NEWLINE_RECORDS)
            with ZS(p_out) as z:
                assert z.metadata["foo"] == 1
                if no_default:
                    assert "build-info" not in z.metadata
                else:
                    assert "build-info" in z.metadata

    with temp_zs_path() as p_out:
        # bad metadata
        run(["make", "{", "-", p_out],
            input=NEWLINE_RECORDS, expected_returncode=2)

    # approx-block-size
    with temp_zs_path() as p_small, temp_zs_path() as p_big:
        run(["make", "{}", "-", p_small, "--approx-block-size", "1000"],
            input=big_input)
        run(["make", "{}", "-", p_big, "--approx-block-size", "10000"],
            input=big_input)
        with ZS(p_small) as z_small, ZS(p_big) as z_big:
            assert list(z_small) == list(z_big)
            # count how many blocks are in each file
            assert (len(list(z_small.block_map(nothing)))
                    > len(list(z_big.block_map(nothing))))

    # branching-factor
    with temp_zs_path() as p_b2, temp_zs_path() as p_b100:
        run(["make", "{}", "-", p_b2,
             "--approx-block-size", "1000", "--branching-factor", "2"],
            input=big_input)
        run(["make", "{}", "-", p_b100,
             "--approx-block-size", "1000", "--branching-factor", "100"],
            input=big_input)
        with ZS(p_b2) as z_b2, ZS(p_b100) as z_b100:
            assert list(z_b2) == list(z_b100)
            assert z_b2.root_index_level > z_b100.root_index_level

    # from file, not just stdin
    with tempname(".txt") as in_p, temp_zs_path() as out_p:
        with open(in_p, "wb") as in_f:
            in_f.write(big_input)
        run(["make", "{}", in_p, out_p])
        with ZS(out_p) as z:
            assert list(z) == big_records

    # integer checking
    for opt in ["--branching-factor", "--approx-block-size",
                "--compress-level", "-z"]:
        with temp_zs_path() as p:
            run(["make", "{}", "-", p, opt, "NOT-AN-INT"],
                input=NEWLINE_RECORDS, expected_returncode=2)

    # bad json
    with temp_zs_path() as p:
        run(["make", "{}", "-", p, "--metadata", "{"],
            input=NEWLINE_RECORDS, expected_returncode=2)
# Example user script (Python 2): preprocess sample sentences and set up
# n-gram count tables against the Google Books 1-gram and 2-gram ZS files.
from zs import ZS
import string
import pickle

table = string.maketrans("", "")
sample = []
with open('stories.txt') as f:
    for line in f.readlines():
        l = line.decode('utf-8').encode('ascii', 'ignore')
        l = l.translate(table, string.punctuation.replace('.', ''))
        l = l.lower().replace('.', ' _END_ _START_')
        sample.append(l)

google1 = ZS(
    '../../Corpus/cpl-data.ucsd.edu/zs/google-books-20120701/eng-us-all/google-books-eng-us-all-20120701-1gram.zs'
)
google2 = ZS(
    '../../Corpus/cpl-data.ucsd.edu/zs/google-books-20120701/eng-us-all/google-books-eng-us-all-20120701-2gram.zs'
)


# break sentences into bigram/unigram count tables
def populate(sentences):
    ngra = dict()
    nm1gra = dict()
    for sentence in sentences:
        tokens = ['_START_'] + sentence.split()
        for t in xrange(0, len(tokens) - 2):
            ngra[(tokens[t], tokens[t + 1])] = 0
            nm1gra[tokens[t]] = 0
        ngra[(tokens[len(tokens) - 2], tokens[len(tokens) - 1])] = 0
def test_zs():
    for codec in codec_shorthands:
        p = test_data_path("letters-%s.zs" % (codec,))
        for parallelism in [0, 2, "guess"]:
            with ZS(path=p, parallelism=parallelism) as z:
                check_letters_zs(z, codec)
def test_broken_files():
    import glob
    unchecked_paths = set(glob.glob(test_data_path("broken-files/*.zs")))
    # Files that should fail even on casual use (no validate)
    for basename, msg_fragment in [
            ("short-root", ["partial read", "root index length"]),
            ("truncated-root", "unexpected EOF"),
            ("bad-magic", "bad magic"),
            ("incomplete-magic", "partially written"),
            ("header-checksum", "header checksum"),
            ("root-checksum", "checksum mismatch"),
            ("bad-codec", "unrecognized compression"),
            ("non-dict-metadata", "bad metadata"),
            ("truncated-data-1", "unexpectedly ran out of data"),
            ("truncated-data-2", "unexpected EOF"),
            ("truncated-data-3", "unexpected EOF"),
            ("wrong-root-offset", ["checksum mismatch", "root block missing"]),
            ("root-is-data", ["expecting index block", "bad level"]),
            ("wrong-root-level-1", ["expecting index block", "bad index ref"]),
            ("partial-data-1", "past end of block"),
            ("partial-data-2", "end of buffer"),
            ("empty-data", "empty block"),
            ("partial-index-1", "end of buffer"),
            ("partial-index-2", "end of buffer"),
            ("partial-index-3", "past end of block"),
            ("partial-index-4", "past end of block"),
            ("empty-index", "empty block"),
            ("bad-total-length", "header says it should"),
            ("bad-level-root", ["extension block", "root block missing"]),
            ("bad-level-index-2",
             ["extension block", "dangling or multiple refs"]),
            ("post-header-junk", "checksum mismatch"),
            ]:
        print(basename)

        def any_match(mfs, haystack):
            if isinstance(mfs, str):
                mfs = [mfs]
            for mf in mfs:
                if mf in haystack:
                    return True
            return False

        # to prevent accidental false success:
        assert not any_match(msg_fragment, basename)
        p = test_data_path("broken-files/%s.zs" % (basename,))
        with assert_raises(ZSCorrupt) as cm:
            with ZS(p) as z:
                list(z)
                # use start= to ensure that we do an index traversal
                list(z.search(start=b"\x00"))
        assert any_match(msg_fragment, str(cm.exception))
        with assert_raises(ZSCorrupt) as cm:
            with ZS(p) as z:
                z.validate()
        assert any_match(msg_fragment, str(cm.exception))
        unchecked_paths.discard(p)

    # Files that might look okay locally, but validate should detect problems
    for basename, msg_fragment in [
            ("unref-data", "unreferenced"),
            ("unref-index", "unreferenced"),
            ("wrong-root-length", "root index length"),
            ("wrong-root-level-2", "level 3 to level 1"),
            ("repeated-index", "multiple ref"),
            ("bad-ref-length", "!= actual length"),
            ("bad-index-order", "unsorted offsets"),
            ("bad-index-order", "unsorted records"),
            ("bad-data-order", "unsorted records"),
            ("bad-index-key-1", "too large for block"),
            ("bad-index-key-2", "too small for block"),
            ("bad-index-key-3", "too small for block"),
            ("bad-sha256", "data hash mismatch"),
            # not really an accurate message -- this file has a level 1 index
            # pointing to an extension block. the reader doesn't blow up at
            # this because it knows that below a level 1 index is data and
            # switches to streaming read, and then streaming read ignores
            # extension blocks, so only fsck() will catch it. And fsck() uses
            # a streaming read so extension blocks are invisible to it, and
            # all it sees is that there's this reference pointing into an
            # invisible hole in space, which looks like a dangling reference.
            ("bad-level-index-1", "dangling"),
            ]:
        print(basename)
        # to prevent accidental false success:
        assert msg_fragment not in basename
        p = test_data_path("broken-files/%s.zs" % (basename,))
        with ZS(p) as z:
            with assert_raises(ZSCorrupt) as cm:
                z.validate()
            assert msg_fragment in str(cm.exception)
        unchecked_paths.discard(p)

    # Files that are a bit tricky, but should in fact be okay
    for basename in [
            "good-index-key-1",
            "good-index-key-2",
            "good-index-key-3",
            "good-extension-blocks",
            "good-extension-header-fields",
            ]:
        print(basename)
        p = test_data_path("broken-files/%s.zs" % (basename,))
        with ZS(p) as z:
            list(z)
            z.validate()
        unchecked_paths.discard(p)

    assert not unchecked_paths
def test_context_manager_closes():
    with ZS(test_data_path("letters-none.zs")) as z:
        assert list(z.search()) == letters_records
    assert_raises(ZSError, list, z.search())
def test_http_notices_lack_of_range_support():
    with web_server(test_data_path(), range_support=False) as root_url:
        codec = "deflate"
        url = "%s/letters-%s.zs" % (root_url, codec)
        assert_raises(ZSError, lambda: list(ZS(url=url)))
# Example user script: scan the Google Books 2-gram ZS file and print its
# 1000th record.
import os
import math as m
from zs import ZS
from check_normality import check_normality
from argparse import ArgumentParser
from pathlib import Path
import re

file = ZS(
    '/om/data/public/corpora/google-books-v2/eng-us-all/google-books-eng-us-all-20120701-2gram.zs'
)
c = 0
for line in file:
    c += 1
    if c == 1000:
        print(line.decode('utf-8'))
file.close()
replication_POS_word_list[line[0]] = line[1]

for line in file3:
    line = line.decode('utf-8').split()
    pho_syl_len_word_list[line[0]] = line[1:]

ix = {}  # ix[w] = {cnt:0, ix_bg:[], info:0, info_g:[], N:0}
total = 0
bg = {}  # bg['bg'] = 0
cnt = 'cnt'
ix_bg = 'ix_bg'
freq = 'freq'
info = 'info'
info_g = 'info_g'
N = 'N'

file = ZS(input_folder)
c = 0
for line in file:
    c += 1
    if c % 1000000000 == 0:
        print(c, ' ', line.decode('utf-8'))
    line = line.decode('utf-8').split()
    if len(line) < 6 or int(line[3]) < 1960 or int(line[3]) > 2000:
        continue
    for w in line[:-3]:
        if value.match(w):
            w = w.lower()
    tmp_bg = line[0].split("_")[0] + "_" + line[1].split("_")[0]
    tmp_tg = line[2].split("_")[0]
    tmp_cnt = int(line[4])
    if tmp_tg not in ix:
def test_extension_blocks():
    # Check that the reader happily skips over the extension blocks in the
    # middle of the file.
    with ZS(test_data_path("broken-files/good-extension-blocks.zs")) as z:
        assert list(z) == [b"a", b"b", b"c", b"d"]