Example #1
def test_ref_loops():
    # Had a bunch of trouble eliminating reference loops in the ZS object.
    # Don't use a 'with' statement here because that keeps another ref, which just
    # confuses things.
    z = ZS(test_data_path("letters-none.zs"))
    try:
        # one ref for 'z', one for the temporary passed to sys.getrefcount
        print(sys.getrefcount(z))
        assert sys.getrefcount(z) == 2
        list(z)
        assert sys.getrefcount(z) == 2
    finally:
        z.close()
Example #2
def test_http_zs():
    with web_server(test_data_path()) as root_url:
        codec = "deflate"
        url = "%s/letters-%s.zs" % (root_url, codec)
        for parallelism in [0, 2]:
            with ZS(url=url, parallelism=parallelism) as z:
                check_letters_zs(z, codec)
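
A minimal sketch of reading a remotely hosted .zs file outside the test harness; the URL here is a placeholder, and the server needs HTTP Range support (see the range-support test further down).

from zs import ZS

with ZS(url="http://example.com/letters-deflate.zs", parallelism=0) as z:
    # start= is the same keyword the tests use to force an index traversal
    for record in z.search(start=b"m"):
        print(record)
        break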
Example #3
def test_block_exec():
    # This function is tricky to test in a multiprocessing world, because we need
    # some way to communicate back from the subprocesses that the execution
    # actually happened... instead we just test it in serial
    # mode. (Fortunately it is a super-trivial function.)
    z = ZS(test_data_path("letters-none.zs"), parallelism=0)
    # Because we're in serial mode, the fn doesn't need to be pickleable
    class CountBlocks(object):
        def __init__(self):
            self.count = 0
        def __call__(self, records):
            self.count += 1
    count_blocks = CountBlocks()
    z.block_exec(count_blocks)
    assert count_blocks.count > 1
    assert count_blocks.count == len(list(z.block_map(identity)))
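
For comparison, a rough sketch of counting records per block with block_map alone, assuming each block is handed to the callable as a sequence of records (as the CountBlocks/identity usage above suggests):

with ZS(test_data_path("letters-none.zs"), parallelism=0) as z:
    # len() runs once per block; block_map yields one result per block
    per_block = list(z.block_map(len))
    print(len(per_block), "blocks,", sum(per_block), "records")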
Example #4
def open_zs(opts, **kwargs):
    zs_path_or_url = opts["<zs_file>"]
    if zs_path_or_url.startswith("http"):
        kwargs["url"] = zs_path_or_url
    else:
        kwargs["path"] = zs_path_or_url
    if "__j__" in opts:
        kwargs["parallelism"] = opts["__j__"]
    return ZS(**kwargs)
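
A small usage sketch for open_zs with a docopt-style options dict; the keys mirror the ones the function reads, the values are invented for illustration:

opts = {"<zs_file>": "letters-none.zs", "__j__": 0}
with open_zs(opts) as z:
    for record in z:
        print(record)
        break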
Example #5
def test_zs_close():
    z = ZS(test_data_path("letters-none.zs"))
    z.close()
    for call in [[list, z.search()],
                 [list,
                  z.block_map(_check_raise_helper, AssertionError)],
                 [list, z],
                 [z.dump, BytesIO()],
                 [z.validate],
                 ]:
        print(repr(call))
        assert_raises(ZSError, *call)
    # But calling .close() twice is fine.
    z.close()

    # smoke test for __del__ method
    ZS(test_data_path("letters-none.zs"))
Example #6
def test_info():
    with simple_zs() as p:
        out = run(["info", p])
        info = json.loads(out.stdout.decode("ascii"))
        with ZS(p) as z:
            assert info["codec"] == z.codec
            assert binascii.unhexlify(info["data_sha256"]) == z.data_sha256
            assert info["metadata"] == z.metadata

        just_metadata = json.loads(
            run(["info", p, "--metadata-only"]).stdout.decode("ascii"))
        assert info["metadata"] == just_metadata
Example #7
def test_big_headers():
    from zs.reader import _lower_header_size_guess
    with _lower_header_size_guess():
        z = ZS(test_data_path("letters-none.zs"))
        assert z.codec == "none"
        assert z.data_sha256 == letters_sha256
        assert z.metadata == {
            u"test-data": u"letters",
            u"build-info": {
                u"user": u"test-user",
                u"host": u"test-host",
                u"time": u"2000-01-01T00:00:00.000000Z",
                u"version": u"zs test",
                },
            }
        assert list(z) == letters_records
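
The same header attributes can be printed directly for any local file; a short sketch reusing the test helpers above:

import binascii

with ZS(test_data_path("letters-none.zs")) as z:
    print("codec:      ", z.codec)
    print("data sha256:", binascii.hexlify(z.data_sha256))
    print("metadata:   ", z.metadata)
    print("root level: ", z.root_index_level)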
Example #8
def ok_zs(p):
    z = ZS(p)
    z.validate()
    return z
Example #9
def test_make():
    from .test_writer import records as big_records, temp_zs_path

    with simple_zs(big_records) as p_in:
        for format_opt in [
                "--terminator=\\n",
                "--terminator=\\x00",
                "--length-prefixed=uleb128",
                "--length-prefixed=u64le",
        ]:
            input = run(["dump", p_in, format_opt]).stdout
            with temp_zs_path() as p_out:
                run(["make", format_opt, "{}", "-", p_out], input=input)

                with ZS(p_out) as z:
                    z.validate()
                    assert list(z) == big_records

        big_input = b"\n".join(big_records + [b""])

        # smoke test -j
        with temp_zs_path() as p_out:
            run(["make", "{}", "-", p_out, "-j", "3"], input=NEWLINE_RECORDS)

        # --no-spinner produces less chatter
        with temp_zs_path() as p_out:
            r1 = run(["make", "{}", "-", p_out], input=big_input)
        with temp_zs_path() as p_out:
            r2 = run(["make", "{}", "-", p_out, "--no-spinner"],
                     input=big_input)
        assert len(r2.stdout) < len(r1.stdout)

        # codecs and compress level
        # we need some non-trivial input (so the compression algorithms have
        # some work to do) that's large enough for things like window
        # sizes to make a difference.
        r = random.Random(0)
        scrambled_letters = "".join(r.sample("abcdefghijklmnopqrstuvwxyz", 26))
        scrambled_letters = scrambled_letters.encode("ascii")
        pieces = []
        for i in range(200000):
            low = r.randrange(25)
            high = r.randrange(low, 26)
            pieces.append(scrambled_letters[low:high])
        # put a really big piece in to make long-distance memory matter more
        pieces.append(b"m" * 2**18)
        pieces.sort()
        pieces.append(b"")
        bigger_input = b"\n".join(pieces)

        sizes = {}
        for settings in [
                "--codec=none",
                "--codec=deflate",
                "--codec=deflate --compress-level 1",
                "--codec=lzma",
                "--codec=lzma --compress-level 0e",
                "--codec=lzma --compress-level 0",
                "--codec=lzma --compress-level 1e",
                "--codec=lzma --compress-level 1",
        ]:
            with temp_zs_path() as p_out:
                run(
                    [
                        "make",
                        "{}",
                        "-",
                        p_out,
                        # bigger than the lzma -z 0 blocksize of 256k
                        "--approx-block-size",
                        "400000"
                    ] + settings.split(),
                    input=bigger_input)
                sizes[settings] = os.stat(p_out).st_size
        assert (
            sizes["--codec=lzma --compress-level 0e"] == sizes["--codec=lzma"])
        for big, small in [
            ("none", "deflate"),
            ("deflate --compress-level 1", "deflate"),
            ("lzma --compress-level 0", "lzma --compress-level 1"),
            ("lzma --compress-level 0e", "lzma --compress-level 1e"),
            ("lzma --compress-level 0", "lzma --compress-level 0e"),
            ("lzma --compress-level 1", "lzma --compress-level 1e"),
        ]:
            assert sizes["--codec=" + big] > sizes["--codec=" + small]

        # metadata and no-default-metadata
        for no_default in [True, False]:
            with temp_zs_path() as p_out:
                args = []
                if no_default:
                    args.append("--no-default-metadata")
                run(["make", "{\"foo\": 1}", "-", p_out] + args,
                    input=NEWLINE_RECORDS)
                with ZS(p_out) as z:
                    assert z.metadata["foo"] == 1
                    if no_default:
                        assert "build-info" not in z.metadata
                    else:
                        assert "build-info" in z.metadata
        with temp_zs_path() as p_out:
            # bad metadata
            run(["make", "{", "-", p_out],
                input=NEWLINE_RECORDS,
                expected_returncode=2)

        # approx-block-size
        with temp_zs_path() as p_small, temp_zs_path() as p_big:
            run(["make", "{}", "-", p_small, "--approx-block-size", "1000"],
                input=big_input)
            run(["make", "{}", "-", p_big, "--approx-block-size", "10000"],
                input=big_input)

            with ZS(p_small) as z_small, ZS(p_big) as z_big:
                assert list(z_small) == list(z_big)
                # count how many blocks are in each file
                assert (len(list(z_small.block_map(nothing))) > len(
                    list(z_big.block_map(nothing))))

        # branching-factor
        with temp_zs_path() as p_b2, temp_zs_path() as p_b100:
            run([
                "make", "{}", "-", p_b2, "--approx-block-size", "1000",
                "--branching-factor", "2"
            ],
                input=big_input)
            run([
                "make", "{}", "-", p_b100, "--approx-block-size", "1000",
                "--branching-factor", "100"
            ],
                input=big_input)

            with ZS(p_b2) as z_b2, ZS(p_b100) as z_b100:
                assert list(z_b2) == list(z_b100)
                assert z_b2.root_index_level > z_b100.root_index_level

        # from file, not just stdin
        with tempname(".txt") as in_p, temp_zs_path() as out_p:
            with open(in_p, "wb") as in_f:
                in_f.write(big_input)
            run(["make", "{}", in_p, out_p])

            with ZS(out_p) as z:
                assert list(z) == big_records

        # integer checking
        for opt in [
                "--branching-factor", "--approx-block-size",
                "--compress-level", "-z"
        ]:
            with temp_zs_path() as p:
                run(["make", "{}", "-", p, opt, "NOT-AN-INT"],
                    input=NEWLINE_RECORDS,
                    expected_returncode=2)
        # bad json
        with temp_zs_path() as p:
            run(["make", "{}", "-", p, "--metadata", "{"],
                input=NEWLINE_RECORDS,
                expected_returncode=2)
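
Outside the test harness the same round trip looks roughly like this; a sketch assuming the zs command-line tool (with the make subcommand exercised above) is on PATH:

import subprocess
from zs import ZS

records = b"\n".join([b"apple", b"banana", b"cherry", b""])
# "{}" is the metadata argument, "-" means read the records from stdin
subprocess.run(["zs", "make", "{}", "-", "fruit.zs"], input=records, check=True)
with ZS("fruit.zs") as z:
    z.validate()
    assert list(z) == [b"apple", b"banana", b"cherry"]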
Example #10
from zs import ZS
import string
import pickle

# (Python 2 code: string.maketrans / two-argument translate, xrange)
table = string.maketrans("", "")
sample = []
with open('stories.txt') as f:
    for line in f.readlines():
        l = line.decode('utf-8').encode('ascii', 'ignore')
        l = l.translate(table, string.punctuation.replace('.', ''))
        l = l.lower().replace('.', ' _END_ _START_')
        sample.append(l)

google1 = ZS(
    '../../Corpus/cpl-data.ucsd.edu/zs/google-books-20120701/eng-us-all/google-books-eng-us-all-20120701-1gram.zs'
)
google2 = ZS(
    '../../Corpus/cpl-data.ucsd.edu/zs/google-books-20120701/eng-us-all/google-books-eng-us-all-20120701-2gram.zs'
)


# break sentences into tokens and register every bigram/unigram they contain
def populate(sentences):
    ngra = dict()    # bigram table, keyed by (w1, w2)
    nm1gra = dict()  # unigram table, keyed by w
    for sentence in sentences:
        tokens = ['_START_'] + sentence.split()
        for t in xrange(0, len(tokens) - 2):
            ngra[(tokens[t], tokens[t + 1])] = 0
            nm1gra[tokens[t]] = 0
        ngra[(tokens[len(tokens) - 2], tokens[len(tokens) - 1])] = 0
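
A hedged sketch (not part of the original script) of how a count could be pulled out of the 2-gram file with a prefix search; the tab-separated record layout and the column positions are assumptions about the corpus format:

def bigram_count(zs_file, w1, w2, year=None):
    # records are assumed to look like b"w1 w2\tYEAR\tCOUNT\t...";
    # adjust the field indices if the corpus is laid out differently
    prefix = ("%s %s\t" % (w1, w2)).encode("utf-8")
    total = 0
    for record in zs_file.search(prefix=prefix):
        fields = record.decode("utf-8").split("\t")
        if year is None or fields[1] == str(year):
            total += int(fields[2])
    return total

# e.g. bigram_count(google2, "the", "cat")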
Example #12
def test_zs():
    for codec in codec_shorthands:
        p = test_data_path("letters-%s.zs" % (codec,))
        for parallelism in [0, 2, "guess"]:
            with ZS(path=p, parallelism=parallelism) as z:
                check_letters_zs(z, codec)
Example #13
def test_broken_files():
    import glob
    unchecked_paths = set(glob.glob(test_data_path("broken-files/*.zs")))
    # Files that should fail even on casual use (no validate)
    for basename, msg_fragment in [
            ("short-root", ["partial read", "root index length"]),
            ("truncated-root", "unexpected EOF"),
            ("bad-magic", "bad magic"),
            ("incomplete-magic", "partially written"),
            ("header-checksum", "header checksum"),
            ("root-checksum", "checksum mismatch"),
            ("bad-codec", "unrecognized compression"),
            ("non-dict-metadata", "bad metadata"),
            ("truncated-data-1", "unexpectedly ran out of data"),
            ("truncated-data-2", "unexpected EOF"),
            ("truncated-data-3", "unexpected EOF"),
            ("wrong-root-offset", ["checksum mismatch", "root block missing"]),
            ("root-is-data", ["expecting index block", "bad level"]),
            ("wrong-root-level-1", ["expecting index block", "bad index ref"]),
            ("partial-data-1", "past end of block"),
            ("partial-data-2", "end of buffer"),
            ("empty-data", "empty block"),
            ("partial-index-1", "end of buffer"),
            ("partial-index-2", "end of buffer"),
            ("partial-index-3", "past end of block"),
            ("partial-index-4", "past end of block"),
            ("empty-index", "empty block"),
            ("bad-total-length", "header says it should"),
            ("bad-level-root", ["extension block", "root block missing"]),
            ("bad-level-index-2", ["extension block", "dangling or multiple refs"]),
            ("post-header-junk", "checksum mismatch"),
            ]:
        print(basename)
        def any_match(mfs, haystack):
            if isinstance(mfs, str):
                mfs = [mfs]
            for mf in mfs:
                if mf in haystack:
                    return True
            return False
        # to prevent accidental false success:
        assert not any_match(msg_fragment, basename)
        p = test_data_path("broken-files/%s.zs" % (basename,))
        with assert_raises(ZSCorrupt) as cm:
            with ZS(p) as z:
                list(z)
                # use start= to ensure that we do an index traversal
                list(z.search(start=b"\x00"))
        assert any_match(msg_fragment, str(cm.exception))
        with assert_raises(ZSCorrupt) as cm:
            with ZS(p) as z:
                z.validate()
        assert any_match(msg_fragment, str(cm.exception))
        unchecked_paths.discard(p)

    # Files that might look okay locally, but validate should detect problems
    for basename, msg_fragment in [
            ("unref-data", "unreferenced"),
            ("unref-index", "unreferenced"),
            ("wrong-root-length", "root index length"),
            ("wrong-root-level-2", "level 3 to level 1"),
            ("repeated-index", "multiple ref"),
            ("bad-ref-length", "!= actual length"),
            ("bad-index-order", "unsorted offsets"),
            ("bad-index-order", "unsorted records"),
            ("bad-data-order", "unsorted records"),
            ("bad-index-key-1", "too large for block"),
            ("bad-index-key-2", "too small for block"),
            ("bad-index-key-3", "too small for block"),
            ("bad-sha256", "data hash mismatch"),
            # not really an accurate message -- this file has a level 1 index
            # pointing to an extension block. the reader doesn't blow up at
            # this because it knows that below a level 1 index is data and
            # switches to streaming read, and then streaming read ignores
            # extension blocks, so only fsck() will catch it. And fsck() uses
            # a streaming read so extension blocks are invisible to it, and
            # all it sees is that there's this reference pointing into an
            # invisible hole in space, which looks like a dangling reference.
            ("bad-level-index-1", "dangling"),
            ]:
        print(basename)
        # to prevent accidental false success:
        assert msg_fragment not in basename
        p = test_data_path("broken-files/%s.zs" % (basename,))
        with ZS(p) as z:
            with assert_raises(ZSCorrupt) as cm:
                z.validate()
        assert msg_fragment in str(cm.exception)
        unchecked_paths.discard(p)

    # Files that are a bit tricky, but should in fact be okay
    for basename in [
            "good-index-key-1",
            "good-index-key-2",
            "good-index-key-3",
            "good-extension-blocks",
            "good-extension-header-fields",
            ]:
        print(basename)
        p = test_data_path("broken-files/%s.zs" % (basename,))
        with ZS(p) as z:
            list(z)
            z.validate()
        unchecked_paths.discard(p)

    assert not unchecked_paths
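
In application code the same corruption check usually looks like this; a brief sketch, assuming ZSCorrupt is importable from the zs package as the tests' usage suggests:

from zs import ZS, ZSCorrupt

def open_validated(path):
    # open the file, refuse to hand it back if validation finds corruption
    z = ZS(path)
    try:
        z.validate()
    except ZSCorrupt:
        z.close()
        raise
    return z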
Example #14
def test_context_manager_closes():
    with ZS(test_data_path("letters-none.zs")) as z:
        assert list(z.search()) == letters_records
    assert_raises(ZSError, list, z.search())
Example #15
def test_http_notices_lack_of_range_support():
    with web_server(test_data_path(), range_support=False) as root_url:
        codec = "deflate"
        url = "%s/letters-%s.zs" % (root_url, codec)
        assert_raises(ZSError, lambda: list(ZS(url=url)))
Example #16
import os
import math as m
from zs import ZS
from check_normality import check_normality
from argparse import ArgumentParser
from pathlib import Path
import re

file = ZS(
    '/om/data/public/corpora/google-books-v2/eng-us-all/google-books-eng-us-all-20120701-2gram.zs'
)
c = 0
for line in file:
    c += 1
    if c == 1000:
        print(line.decode('utf-8'))
file.close()
Example #17
    replication_POS_word_list[line[0]] = line[1]
for line in file3:
    line = line.decode('utf-8').split()
    pho_syl_len_word_list[line[0]] = line[1:]

ix = {}  # ix[w] = {cnt:0, ix_bg:[], info:0, info_g:[], N:0}
total = 0
bg = {}  # b g['bg'] = 0
cnt = 'cnt'
ix_bg = 'ix_bg'
freq = 'freq'
info = 'info'
info_g = 'info_g'
N = 'N'

file = ZS(input_folder)
c = 0
for line in file:
    c += 1
    if c % 1000000000 == 0:
        print(c, ' ', line.decode('utf-8'))
    line = line.decode('utf-8').split()
    if len(line) < 6 or int(line[3]) < 1960 or int(line[3]) > 2000:
        continue
    for w in line[:-3]:
        if value.match(w):
            w = w.lower()
    tmp_bg = line[0].split("_")[0] + "_" + line[1].split("_")[0]
    tmp_tg = line[2].split("_")[0]
    tmp_cnt = int(line[4])
    if tmp_tg not in ix:
Example #18
def test_extension_blocks():
    # Check that the reader happily skips over the extension blocks in the
    # middle of the file.
    with ZS(test_data_path("broken-files/good-extension-blocks.zs")) as z:
        assert list(z) == [b"a", b"b", b"c", b"d"]