Beispiel #1
0
def test_jay_empty_frame(tempfile):
    """A Frame without rows or columns can be written to Jay and read back."""
    empty = dt.Frame()
    empty.save(tempfile, format="jay")
    assert os.path.isfile(tempfile)
    reloaded = dt.open(tempfile)
    # The reloaded frame must be just as empty as the one that was saved.
    assert reloaded.shape == (0, 0)
    assert reloaded.names == ()
Beispiel #2
0
def test_jay_unicode_names(tempfile):
    """Non-ASCII column names are preserved by a Jay save/open round trip."""
    # Four keys that look alike on screen but are distinct strings.
    columns = {"py": [1], "ру": [2], "рy": [3], "pу": [4]}
    original = dt.Frame(columns)
    assert len(set(original.names)) == 4
    original.save(tempfile, format="jay")
    assert os.path.isfile(tempfile)
    restored = dt.open(tempfile)
    assert original.names == restored.names
    assert_equals(original, restored)
Beispiel #3
0
def test_key_save(tempfile):
    """A two-column key set on a Frame is still reported after a Jay reload."""
    keyed = dt.Frame(D=range(6), A=[3, 7, 5, 2, 2, 3], B=[1, 2, 2, 3, 4, 4])
    keyed.key = ["A", "B"]
    keyed.internal.check()
    keyed.save(tempfile, format="jay")

    reloaded = dt.open(tempfile)
    expected_key = ("A", "B")
    assert reloaded.key == expected_key
    reloaded.internal.check()
Beispiel #4
0
def test_key_save(tempfile):
    """A two-column key survives `to_jay()` followed by `dt.open()`."""
    keyed = dt.Frame(D=range(6), A=[3, 7, 5, 2, 2, 3], B=[1, 2, 2, 3, 4, 4])
    keyed.key = ["A", "B"]
    frame_integrity_check(keyed)
    keyed.to_jay(tempfile)

    reloaded = dt.open(tempfile)
    expected_key = ("A", "B")
    assert reloaded.key == expected_key
    frame_integrity_check(reloaded)
Beispiel #5
0
def test_open(tempfile_jay):
    """`dt.open()` is expected to warn and to record the file it read from."""
    written = dt.Frame(A=range(5))
    written.to_jay(tempfile_jay)
    # Opening must raise a FutureWarning, otherwise pytest.warns fails.
    with pytest.warns(FutureWarning):
        opened = dt.open(tempfile_jay)
    # Only the frame that came from disk carries a source path.
    assert written.source is None
    assert opened.source == tempfile_jay
    assert_equals(written, opened)
Beispiel #6
0
def test_issue627():
    """
    Test saving Frame with unicode file names.

    A single-row Frame whose four column names contain non-ASCII characters
    is saved into a scratch directory and reopened; the reopened Frame must
    equal the original.

    The scratch directory is removed in a ``finally`` clause so that it does
    not leak when saving, opening or the comparison fails.
    """
    dir1 = tempfile.mkdtemp()
    try:
        dt0 = dt.Frame({"py": [1], "ру": [2], "рy": [3], "pу": [4]})
        assert dt0.shape == (1, 4)
        dt0.save(dir1)
        dt1 = dt.open(dir1)
        assert_equals(dt0, dt1)
    finally:
        # mkdtemp() leaves cleanup to the caller; do it even on failure.
        shutil.rmtree(dir1)
Beispiel #7
0
def test_save_view(tempdir):
    """A sorted view saved with `dt.save()` reopens as a non-view frame."""
    base = dt.Frame([4, 0, -2, 3, 17, 2, 0, 1, 5], names=["fancy"])
    view = base.sort(0)
    assert view.internal.isview
    dt.save(view, tempdir)

    loaded = dt.open(tempdir)
    assert not loaded.internal.isview
    # Names and data must match what the view exposed.
    assert loaded.names == view.names
    assert loaded.topython() == view.topython()
Beispiel #8
0
def test_rbind_mmapped(tempfile):
    """Rows can be appended to a frame that was opened from a Jay file."""
    saved = dt.Frame({"A": [1, 5, 7], "B": ["one", "two", None]})
    saved.to_jay(tempfile)
    # Drop the in-memory original so only the file-backed copy remains.
    del saved

    opened = dt.open(tempfile)
    extra_row = dt.Frame({"A": [-1], "B": ["zero"]})
    opened.rbind(extra_row)

    expected = dt.Frame({"A": [1, 5, 7, -1],
                         "B": ["one", "two", None, "zero"]})
    assert_equals(opened, expected)
Beispiel #9
0
def test_h2oai7014(tempfile):
    """Slice, sort and materialize a frame that was opened from a Jay file."""
    source = dt.Frame([[None, 't'], [3580, 1047]], names=["ID", "count"])
    source.to_jay(tempfile)
    # The data has to be opened from file
    frame = dt.open(tempfile)
    # Drop the first row, then sort what is left by the "count" column.
    frame = frame[1:, :]
    frame = frame[:, :, sort("count")]
    frame.materialize()
    expected = [['t'], [1047]]
    assert frame.to_list() == expected
Beispiel #10
0
def test_save_and_load():
    """
    Save a Frame to a scratch directory and check that it reopens unchanged.

    The Frame holds one column each of integer, boolean and string values
    (the latter two including ``None`` entries).

    The scratch directory is removed in a ``finally`` clause so that it does
    not leak when saving, opening or the comparison fails.
    """
    dir1 = tempfile.mkdtemp()
    try:
        dt0 = dt.Frame({"A": [1, 7, 100, 12],
                        "B": [True, None, False, None],
                        "C": ["alpha", "beta", None, "delta"]})
        dt0.save(dir1)
        dt1 = dt.open(dir1)
        assert_equals(dt0, dt1)
    finally:
        # mkdtemp() leaves cleanup to the caller; do it even on failure.
        shutil.rmtree(dir1)
Beispiel #11
0
def test_save_and_load():
    """
    Save a Frame in "nff" format to a scratch directory and reopen it.

    The Frame holds one column each of integer, boolean and string values
    (the latter two including ``None`` entries). Saving is expected to emit
    a FutureWarning; the reopened Frame must equal the original.

    The scratch directory is removed in a ``finally`` clause so that it does
    not leak when saving, opening or the comparison fails.
    """
    dir1 = tempfile.mkdtemp()
    try:
        dt0 = dt.Frame({"A": [1, 7, 100, 12],
                        "B": [True, None, False, None],
                        "C": ["alpha", "beta", None, "delta"]})
        with pytest.warns(FutureWarning):
            dt0.save(dir1, format="nff")
        dt1 = dt.open(dir1)
        assert_equals(dt0, dt1)
    finally:
        # mkdtemp() leaves cleanup to the caller; do it even on failure.
        shutil.rmtree(dir1)
Beispiel #12
0
def test_jay_simple(tempfile):
    """A small mixed-type Frame round-trips through a Jay file."""
    original = dt.Frame({"A": [-1, 7, 10000, 12],
                         "B": [True, None, False, None],
                         "C": ["alpha", "beta", None, "delta"]})
    original.save(tempfile, format="jay")
    assert os.path.isfile(tempfile)

    # The file is expected to begin with this 8-byte signature.
    with open(tempfile, "rb") as stream:
        header = stream.read(8)
    assert header == b"JAY1\x00\x00\x00\x00"

    restored = dt.open(tempfile)
    assert_equals(original, restored)
Beispiel #13
0
def test_jay_keys(tempfile):
    """A single-column string key is kept when a Frame goes through Jay."""
    keyed = dt.Frame(
        [["ab", "cd", "eee", "coo", "aop"], [1, 2, 3, 4, 5]],
        names=("x", "y"),
    )
    keyed.key = "x"
    assert len(keyed.key) == 1
    # After keying, the rows are expected in ascending order of "x".
    expected_rows = [["ab", "aop", "cd", "coo", "eee"], [1, 5, 2, 4, 3]]
    assert keyed.to_list() == expected_rows

    keyed.to_jay(tempfile)
    reloaded = dt.open(tempfile)
    assert reloaded.key == ("x", )
    assert_equals(keyed, reloaded)
Beispiel #14
0
def test_issue1728(tempfile):
    """Regression test for issue 1728: slice + sort on a frame opened from Jay."""
    source = dt.Frame({'department1': [None, 't'], 'C0': [3580, 1047]})
    source.to_jay(tempfile)
    del source

    frame = dt.open(tempfile)
    # Drop the first row, then sort by the last column.
    frame = frame[1:, :]
    frame = frame[:, :, dt.sort(-1)]
    frame.materialize()
    frame_integrity_check(frame)

    expected = {'department1': ['t'], 'C0': [1047]}
    assert frame.to_dict() == expected
Beispiel #15
0
def test_issue627():
    """
    Test saving Frame with unicode file names.

    A single-row Frame whose four column names contain non-ASCII characters
    is saved in "nff" format into a scratch directory (which is expected to
    emit a FutureWarning) and reopened; the reopened Frame must equal the
    original.

    The scratch directory is removed in a ``finally`` clause so that it does
    not leak when saving, opening or the comparison fails.
    """
    dir1 = tempfile.mkdtemp()
    try:
        dt0 = dt.Frame({"py": [1], "ру": [2], "рy": [3], "pу": [4]})
        assert dt0.shape == (1, 4)
        with pytest.warns(FutureWarning):
            dt0.save(dir1, format="nff")
        dt1 = dt.open(dir1)
        assert_equals(dt0, dt1)
    finally:
        # mkdtemp() leaves cleanup to the caller; do it even on failure.
        shutil.rmtree(dir1)
Beispiel #16
0
def test_rbind_mmapped():
    """
    Append a row to a Frame that was reopened from a scratch directory.

    The original Frame is saved with ``dt.save()`` and deleted, so that the
    Frame being appended to is the one returned by ``dt.open()``. The result
    must equal a Frame built directly from the combined data.

    The scratch directory is removed in a ``finally`` clause so that it does
    not leak when any step of the test fails.
    """
    dir0 = tempfile.mkdtemp()
    try:
        dt0 = dt.Frame({"A": [1, 5, 7], "B": ["one", "two", None]})
        dt.save(dt0, dir0)
        # Drop the in-memory original; only the reopened copy is used below.
        del dt0
        dt1 = dt.open(dir0)
        dt2 = dt.Frame({"A": [-1], "B": ["zero"]})
        dt1.rbind(dt2)
        dtr = dt.Frame({"A": [1, 5, 7, -1],
                        "B": ["one", "two", None, "zero"]})
        assert_equals(dt1, dtr)
    finally:
        # mkdtemp() leaves cleanup to the caller; do it even on failure.
        shutil.rmtree(dir0)
Beispiel #17
0
def test_issue689(tempdir):
    """Regression test for issue 689: filter rows of a frame reopened from disk."""
    n = 300000  # Must be > 65536
    values = [i % 8 for i in range(n)]
    original = dt.Frame(values, names=["A"])
    dt.save(original, tempdir)
    del original

    reopened = dt.open(tempdir)
    # Do not check `reopened`! we want it to be lazy at this point
    selected = reopened(rows=lambda g: g[0] == 1)
    assert selected.internal.check()
    # One value in every eight equals 1.
    assert selected.shape == (n / 8, 1)
Beispiel #18
0
def test_issue689(tempfile):
    """Regression test for issue 689: filter rows of a frame reopened from Jay."""
    n = 300000  # Must be > 65536
    values = [i % 8 for i in range(n)]
    original = dt.Frame(values, names=["A"])
    original.to_jay(tempfile)
    del original

    reopened = dt.open(tempfile)
    # Do not check `reopened`! we want it to be lazy at this point
    selected = reopened[f[0] == 1, :]
    frame_integrity_check(selected)
    # One value in every eight equals 1.
    assert selected.shape == (n / 8, 1)
Beispiel #19
0
def test_empty_string_col():
    """
    Test that Frame with an empty string column can be saved/opened.
    See #604

    The scratch directory is removed in a ``finally`` clause so that it does
    not leak when saving, opening or the comparison fails.
    """
    dir1 = tempfile.mkdtemp()
    try:
        # Second column consists solely of empty strings.
        dt0 = dt.Frame([[1, 2, 3], ["", "", ""]])
        dt0.save(dir1)
        dt1 = dt.open(dir1)
        assert_equals(dt0, dt1)
    finally:
        # mkdtemp() leaves cleanup to the caller; do it even on failure.
        shutil.rmtree(dir1)
Beispiel #20
0
def test_save_view(tempdir):
    """A sorted view saved in "nff" format reopens as a non-view frame."""
    base = dt.Frame([4, 0, -2, 3, 17, 2, 0, 1, 5], names=["fancy"])
    view = base.sort(0)
    assert view.internal.isview
    view.internal.check()
    view.save(tempdir, format="nff")

    loaded = dt.open(tempdir)
    assert not loaded.internal.isview
    loaded.internal.check()
    # Names and data must match what the view exposed.
    assert loaded.names == view.names
    assert loaded.to_list() == view.to_list()
Beispiel #21
0
def test_empty_string_col():
    """
    Test that Frame with an empty string column can be saved/opened.
    See #604

    Saving in "nff" format is expected to emit a FutureWarning.

    The scratch directory is removed in a ``finally`` clause so that it does
    not leak when saving, opening or the comparison fails.
    """
    dir1 = tempfile.mkdtemp()
    try:
        # Second column consists solely of empty strings.
        dt0 = dt.Frame([[1, 2, 3], ["", "", ""]])
        with pytest.warns(FutureWarning):
            dt0.save(dir1, format="nff")
        dt1 = dt.open(dir1)
        assert_equals(dt0, dt1)
    finally:
        # mkdtemp() leaves cleanup to the caller; do it even on failure.
        shutil.rmtree(dir1)
Beispiel #22
0
def test_save_view(tempdir):
    """A sorted view saved in "nff" format (with a warning) reopens as a non-view."""
    base = dt.Frame([4, 0, -2, 3, 17, 2, 0, 1, 5], names=["fancy"])
    view = base.sort(0)
    assert isview(view)
    frame_integrity_check(view)
    # Saving is expected to raise a FutureWarning.
    with pytest.warns(FutureWarning):
        view.save(tempdir, format="nff")

    loaded = dt.open(tempdir)
    assert not isview(loaded)
    frame_integrity_check(loaded)
    assert loaded.names == view.names
    assert loaded.to_list() == view.to_list()
Beispiel #23
0
def test_jay_object_columns(tempfile):
    """Writing to Jay skips an obj64 column and warns about it exactly once."""
    ints = [1, 2, 3, 4]
    objects = [(2, 3), (5, 6, 7), 9, {"A": 3}]
    frame = dt.Frame([ints, objects], names=["A", "B"])
    assert frame.stypes == (dt.int32, dt.obj64)

    with pytest.warns(DatatableWarning) as ws:
        frame.to_jay(tempfile)
    assert len(ws) == 1
    warning_text = ws[0].message.args[0]
    assert "Column `B` of type obj64 was not saved" in warning_text

    # Only the integer column is expected to be present in the file.
    reloaded = dt.open(tempfile)
    frame_integrity_check(reloaded)
    assert reloaded.names == ("A", )
    assert reloaded.to_list() == [ints]
Beispiel #24
0
def test_jay_object_columns(tempfile):
    """Saving as Jay skips an obj64 column and warns about it exactly once."""
    ints = [1, 2, 3, 4]
    objects = [(2, 3), (5, 6, 7), 9, {"A": 3}]
    frame = dt.Frame([ints, objects], names=["A", "B"])
    assert frame.stypes == (dt.int8, dt.obj64)

    with pytest.warns(DatatableWarning) as ws:
        frame.save(tempfile, format="jay")
    assert len(ws) == 1
    warning_text = ws[0].message.args[0]
    assert "Column `B` of type obj64 was not saved" in warning_text

    # Only the integer column is expected to be present in the file.
    reloaded = dt.open(tempfile)
    reloaded.internal.check()
    assert reloaded.names == ("A",)
    assert reloaded.topython() == [ints]
Beispiel #25
0
def test_jay_view(tempfile, seed):
    """A sorted view written with `to_jay()` reopens as an equal non-view frame."""
    random.seed(seed)
    samples = [random.normalvariate(0, 1) for _ in range(1000)]
    base = dt.Frame({"values": samples})
    view = base.sort(0)
    assert isview(view)

    view.to_jay(tempfile)
    assert os.path.isfile(tempfile)
    loaded = dt.open(tempfile)
    assert not isview(loaded)

    frame_integrity_check(view)
    frame_integrity_check(loaded)
    # Names, storage types and data must all agree.
    assert view.names == loaded.names
    assert view.stypes == loaded.stypes
    assert view.to_list() == loaded.to_list()
Beispiel #26
0
def test_jay_view(tempfile, seed):
    """A sorted view saved as Jay reopens as an equal non-view frame."""
    random.seed(seed)
    samples = [random.normalvariate(0, 1) for _ in range(1000)]
    base = dt.Frame({"values": samples})
    view = base.sort(0)
    assert view.internal.isview

    view.save(tempfile, format="jay")
    assert os.path.isfile(tempfile)
    loaded = dt.open(tempfile)
    assert not loaded.internal.isview

    view.internal.check()
    loaded.internal.check()
    # Names, storage types and data must all agree.
    assert view.names == loaded.names
    assert view.stypes == loaded.stypes
    assert view.topython() == loaded.topython()
Beispiel #27
0
def test_obj_columns(tempdir):
    """Saving to a directory skips an object column and warns about it once."""
    ints = [1, 2, 3, 4]
    objects = [(2, 3), (5, 6, 7), 9, {"A": 3}]
    frame = dt.Frame([ints, objects], names=["A", "B"])
    frame.internal.check()
    assert frame.ltypes == (dt.ltype.int, dt.ltype.obj)
    assert frame.shape == (4, 2)

    with pytest.warns(DatatableWarning) as ws:
        frame.save(tempdir)
    assert len(ws) == 1
    warning_text = ws[0].message.args[0]
    assert "Column 'B' of type obj64 was not saved" in warning_text
    del frame

    # Only the integer column is expected to come back.
    reloaded = dt.open(tempdir)
    reloaded.internal.check()
    assert reloaded.shape == (4, 1)
    assert reloaded.names == ("A", )
    assert reloaded.topython() == [ints]
Beispiel #28
0
def test_obj_columns(tempdir):
    """Saving as "nff" warns about deprecation and about the skipped object column."""
    ints = [1, 2, 3, 4]
    objects = [(2, 3), (5, 6, 7), 9, {"A": 3}]
    frame = dt.Frame([ints, objects], names=["A", "B"])
    frame_integrity_check(frame)
    assert frame.ltypes == (dt.ltype.int, dt.ltype.obj)
    assert frame.shape == (4, 2)

    with pytest.warns(DatatableWarning) as ws:
        frame.save(tempdir, format="nff")
    # Two warnings are expected, in this order.
    assert len(ws) == 2
    deprecation_text = ws[0].message.args[0]
    assert "Method `Frame.save()` is deprecated" in deprecation_text
    skipped_text = ws[1].message.args[0]
    assert "Column 'B' of type obj64 was not saved" in skipped_text
    del frame

    # Only the integer column is expected to come back.
    reloaded = dt.open(tempdir)
    frame_integrity_check(reloaded)
    assert reloaded.shape == (4, 1)
    assert reloaded.names == ("A", )
    assert reloaded.to_list() == [ints]
Beispiel #29
0
def test_jay_all_types(tempfile):
    """A Frame with one column per listed stype round-trips through Jay."""
    column_data = [
        [True, False, None, True, True],
        [None, 1, -9, 12, 3],
        [4, 1346, 999, None, None],
        [591, 0, None, -395734, 19384709],
        [None, 777, 1093487019384, -384, None],
        [2.987, 3.45e-24, -0.189134e+12, 45982.1, None],
        [39408.301, 9.459027045e-125, 4.4508e+222, None, 3.14159],
        ["Life", "Liberty", "and", "Pursuit of Happiness", None],
        ["кохайтеся", "чорнобриві", ",", "та", "не з москалями"],
    ]
    column_stypes = [
        dt.bool8, dt.int8, dt.int16, dt.int32, dt.int64, dt.float32,
        dt.float64, dt.str32, dt.str64,
    ]
    column_names = ["b8", "i8", "i16", "i32", "i64", "f32", "f64", "s32",
                    "s64"]
    original = dt.Frame(column_data, stypes=column_stypes, names=column_names)

    # Force calculation of mins and maxs, so that they get saved into Jay
    original.min()
    original.max()
    # Every column must have its own distinct stype.
    assert len(set(original.stypes)) == original.ncols

    original.save(tempfile, format="jay")
    assert os.path.isfile(tempfile)
    restored = dt.open(tempfile)
    assert_equals(original, restored)
# Benchmark script fragment: pandas groupby, question q1.
# NOTE(review): `pd`, `os`, `gc`, `timeit` and `memory_usage` are not defined
# in this fragment; presumably they come from earlier lines or a helper
# module outside this view — confirm.

# Metadata describing this run.
git = pd.__git_version__
task = "groupby"
solution = "pandas"
fun = ".groupby"
cache = "TRUE"

# Dataset name comes from the environment; the data file is data/<name>.jay.
data_name = os.environ['SRC_GRP_LOCAL']
src_grp = os.path.join("data", data_name + ".jay")
print("loading dataset %s" % data_name, flush=True)

## failed attempt to improve read_csv to read 1e9 rows #99
#dtype = {"id1": "category", "id2": "category", "id3": "category", "id4": "int32", "id5": "int32", "id6": "int32", "v1": "int32", "v2": "int32", "v3": "float64"}
#x = pd.read_csv(src_grp, dtype=dtype, engine="c", low_memory=True)

import datatable as dt  # for loading data only, see #47
# Load through datatable, then convert to a pandas DataFrame.
x = dt.open(src_grp).to_pandas()
x['id1'] = x['id1'].astype('category')  # remove after datatable#1691
x['id2'] = x['id2'].astype('category')
x['id3'] = x['id3'].astype('category')
print(len(x.index), flush=True)

print("grouping...", flush=True)

question = "sum v1 by id1"  # q1
# Collect garbage before the timer starts.
gc.collect()
# Timed region: aggregation, index reset and printing of the result shape.
t_start = timeit.default_timer()
ans = x.groupby(['id1']).agg({'v1': 'sum'})
ans.reset_index(inplace=True)  # #68
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
# Memory reading taken after the timed region.
m = memory_usage()
Beispiel #31
0
# Benchmark script: pandas groupby, question q1.
# NOTE(review): `pd`, `os`, `gc`, `timeit` and `memory_usage` are not imported
# here; presumably the executed helpers.py provides them — confirm.

# Run helpers.py in this namespace. The file is opened in a `with` block so
# the handle is closed deterministically instead of being left to the
# garbage collector.
with open("./helpers.py") as _helpers_file:
    exec(_helpers_file.read())

# Metadata describing this run.
ver = pd.__version__
git = pd.__git_version__
task = "groupby"
solution = "pandas"
fun = ".groupby"
cache = "TRUE"

# Dataset name comes from the environment; the data file is data/<name>.jay.
data_name = os.environ['SRC_GRP_LOCAL']
src_grp = os.path.join("data", data_name+".jay")
print("loading dataset %s" % data_name, flush=True)

import datatable as dt # for loading data only, see #47
# Load through datatable, then convert to a pandas DataFrame.
x = dt.open(src_grp).to_pandas()
x['id1'] = x['id1'].astype('category')
x['id2'] = x['id2'].astype('category')
x['id3'] = x['id3'].astype('category')
print(len(x.index), flush=True)

print("grouping...", flush=True)

question = "sum v1 by id1" # q1
# Collect garbage before the timer starts.
gc.collect()
# Timed region: aggregation, index reset and printing of the result shape.
t_start = timeit.default_timer()
ans = x.groupby(['id1']).agg({'v1':'sum'})
ans.reset_index(inplace=True) # #68
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
# Memory reading taken after the timed region.
m = memory_usage()