def test_jay_empty_frame(tempfile):
    """Round-trip a frame with no rows and no columns through a Jay file."""
    empty = dt.Frame()
    empty.save(tempfile, format="jay")
    assert os.path.isfile(tempfile)

    reopened = dt.open(tempfile)
    assert reopened.shape == (0, 0)
    assert reopened.names == ()
def test_jay_unicode_names(tempfile):
    """Save a frame with non-ASCII column names to Jay and compare on reopen."""
    columns = {"py": [1], "ру": [2], "рy": [3], "pу": [4]}
    original = dt.Frame(columns)
    # The four names must be treated as four different strings.
    assert len(set(original.names)) == 4

    original.save(tempfile, format="jay")
    assert os.path.isfile(tempfile)

    restored = dt.open(tempfile)
    assert original.names == restored.names
    assert_equals(original, restored)
def test_key_save(tempfile):
    """A two-column key set before saving is reported again after reopening."""
    keyed = dt.Frame(D=range(6), A=[3, 7, 5, 2, 2, 3], B=[1, 2, 2, 3, 4, 4])
    keyed.key = ["A", "B"]
    keyed.internal.check()
    keyed.save(tempfile, format="jay")

    reopened = dt.open(tempfile)
    assert reopened.key == ("A", "B")
    reopened.internal.check()
def test_key_save(tempfile):
    """A two-column key set before `to_jay()` is reported again after reopening."""
    keyed = dt.Frame(D=range(6), A=[3, 7, 5, 2, 2, 3], B=[1, 2, 2, 3, 4, 4])
    keyed.key = ["A", "B"]
    frame_integrity_check(keyed)
    keyed.to_jay(tempfile)

    reopened = dt.open(tempfile)
    assert reopened.key == ("A", "B")
    frame_integrity_check(reopened)
def test_open(tempfile_jay):
    """`dt.open()` must warn with FutureWarning and set `.source` to the path."""
    written = dt.Frame(A=range(5))
    written.to_jay(tempfile_jay)

    with pytest.warns(FutureWarning):
        opened = dt.open(tempfile_jay)

    # Only the frame read from disk has a source; the in-memory one has none.
    assert written.source is None
    assert opened.source == tempfile_jay
    assert_equals(written, opened)
def test_issue627():
    """Test saving and reopening a Frame whose column names are non-ASCII.

    The frame is written into a fresh temporary directory, which is removed
    in a ``finally`` clause so that a failing assertion does not leave the
    directory behind.
    """
    dir1 = tempfile.mkdtemp()
    try:
        dt0 = dt.Frame({"py": [1], "ру": [2], "рy": [3], "pу": [4]})
        assert dt0.shape == (1, 4)
        dt0.save(dir1)
        dt1 = dt.open(dir1)
        assert_equals(dt0, dt1)
    finally:
        # mkdtemp() leaves cleanup to the caller.
        shutil.rmtree(dir1)
def test_save_view(tempdir):
    """Saving a sorted view and reopening it yields a non-view with equal data."""
    base = dt.Frame([4, 0, -2, 3, 17, 2, 0, 1, 5], names=["fancy"])
    view = base.sort(0)
    assert view.internal.isview

    dt.save(view, tempdir)
    restored = dt.open(tempdir)

    assert not restored.internal.isview
    assert restored.names == view.names
    assert restored.topython() == view.topython()
def test_rbind_mmapped(tempfile):
    """Append a row to a frame that was reopened from a Jay file."""
    saved = dt.Frame({"A": [1, 5, 7], "B": ["one", "two", None]})
    saved.to_jay(tempfile)
    # Drop the in-memory original before the file is reopened.
    del saved

    opened = dt.open(tempfile)
    extra = dt.Frame({"A": [-1], "B": ["zero"]})
    opened.rbind(extra)

    expected = dt.Frame({"A": [1, 5, 7, -1],
                         "B": ["one", "two", None, "zero"]})
    assert_equals(opened, expected)
def test_h2oai7014(tempfile):
    """Slice, sort and materialize a frame that was reopened from a Jay file."""
    data = dt.Frame([[None, 't'], [3580, 1047]], names=["ID", "count"])
    data.to_jay(tempfile)

    # The data has to be opened from file
    frame = dt.open(tempfile)
    frame = frame[1:, :]
    frame = frame[:, :, sort("count")]
    frame.materialize()

    expected = [['t'], [1047]]
    assert frame.to_list() == expected
def test_save_and_load():
    """Save a frame with int, bool and string columns to a directory and reopen it.

    The temporary directory is removed in a ``finally`` clause so that a
    failure while saving, opening or comparing does not leave it behind.
    """
    dir1 = tempfile.mkdtemp()
    try:
        dt0 = dt.Frame({"A": [1, 7, 100, 12],
                        "B": [True, None, False, None],
                        "C": ["alpha", "beta", None, "delta"]})
        dt0.save(dir1)
        dt1 = dt.open(dir1)
        assert_equals(dt0, dt1)
    finally:
        # mkdtemp() leaves cleanup to the caller.
        shutil.rmtree(dir1)
def test_save_and_load():
    """Save a frame in "nff" format (expecting a FutureWarning) and reopen it.

    The temporary directory is removed in a ``finally`` clause so that a
    failure while saving, opening or comparing does not leave it behind.
    """
    dir1 = tempfile.mkdtemp()
    try:
        dt0 = dt.Frame({"A": [1, 7, 100, 12],
                        "B": [True, None, False, None],
                        "C": ["alpha", "beta", None, "delta"]})
        with pytest.warns(FutureWarning):
            dt0.save(dir1, format="nff")
        dt1 = dt.open(dir1)
        assert_equals(dt0, dt1)
    finally:
        # mkdtemp() leaves cleanup to the caller.
        shutil.rmtree(dir1)
def test_jay_simple(tempfile):
    """Write a small three-column frame to Jay, check the file header, reopen."""
    original = dt.Frame({"A": [-1, 7, 10000, 12],
                         "B": [True, None, False, None],
                         "C": ["alpha", "beta", None, "delta"]})
    original.save(tempfile, format="jay")
    assert os.path.isfile(tempfile)

    # The first eight bytes of the file are expected to be this signature.
    with open(tempfile, "rb") as inp:
        header = inp.read(8)
        assert header == b"JAY1\x00\x00\x00\x00"

    reopened = dt.open(tempfile)
    assert_equals(original, reopened)
def test_jay_keys(tempfile):
    """A single-column string key is still set after a Jay round-trip."""
    keyed = dt.Frame([["ab", "cd", "eee", "coo", "aop"], [1, 2, 3, 4, 5]],
                     names=("x", "y"))
    keyed.key = "x"
    assert len(keyed.key) == 1
    # After the key is set, the rows are expected in this order.
    assert keyed.to_list() == [["ab", "aop", "cd", "coo", "eee"],
                               [1, 5, 2, 4, 3]]

    keyed.to_jay(tempfile)
    reopened = dt.open(tempfile)
    assert reopened.key == ("x", )
    assert_equals(keyed, reopened)
def test_issue1728(tempfile):
    """Slice, sort by the last column and materialize a frame opened from Jay."""
    data = dt.Frame({'department1': [None, 't'], 'C0': [3580, 1047]})
    data.to_jay(tempfile)
    # Drop the in-memory original so only the reopened frame is used below.
    del data

    frame = dt.open(tempfile)
    frame = frame[1:, :]
    frame = frame[:, :, dt.sort(-1)]
    frame.materialize()
    frame_integrity_check(frame)

    expected = {'department1': ['t'], 'C0': [1047]}
    assert frame.to_dict() == expected
def test_issue627():
    """Test saving a Frame with non-ASCII column names in "nff" format.

    `Frame.save()` is expected to emit a FutureWarning. The temporary
    directory is removed in a ``finally`` clause so that a failing step
    does not leave it behind.
    """
    dir1 = tempfile.mkdtemp()
    try:
        dt0 = dt.Frame({"py": [1], "ру": [2], "рy": [3], "pу": [4]})
        assert dt0.shape == (1, 4)
        with pytest.warns(FutureWarning):
            dt0.save(dir1, format="nff")
        dt1 = dt.open(dir1)
        assert_equals(dt0, dt1)
    finally:
        # mkdtemp() leaves cleanup to the caller.
        shutil.rmtree(dir1)
def test_rbind_mmapped():
    """Append a row to a frame that was reopened from a saved directory.

    The temporary directory is removed in a ``finally`` clause so that a
    failure in any step does not leave it behind.
    """
    dir0 = tempfile.mkdtemp()
    try:
        dt0 = dt.Frame({"A": [1, 5, 7], "B": ["one", "two", None]})
        dt.save(dt0, dir0)
        # Drop the in-memory original before the directory is reopened.
        del dt0
        dt1 = dt.open(dir0)
        dt2 = dt.Frame({"A": [-1], "B": ["zero"]})
        dt1.rbind(dt2)
        dtr = dt.Frame({"A": [1, 5, 7, -1],
                        "B": ["one", "two", None, "zero"]})
        assert_equals(dt1, dtr)
    finally:
        # mkdtemp() leaves cleanup to the caller.
        shutil.rmtree(dir0)
def test_issue689(tempdir):
    """Filter rows of a frame reopened from disk without checking it first."""
    nrows = 300000  # Must be > 65536
    values = [i % 8 for i in range(nrows)]
    saved = dt.Frame(values, names=["A"])
    dt.save(saved, tempdir)
    del saved

    reopened = dt.open(tempdir)
    # Do not check `reopened`! we want it to be lazy at this point
    selected = reopened(rows=lambda g: g[0] == 1)
    assert selected.internal.check()
    # One value in eight equals 1.
    assert selected.shape == (nrows / 8, 1)
def test_issue689(tempfile):
    """Filter rows of a frame reopened from Jay without checking it first."""
    nrows = 300000  # Must be > 65536
    values = [i % 8 for i in range(nrows)]
    saved = dt.Frame(values, names=["A"])
    saved.to_jay(tempfile)
    del saved

    reopened = dt.open(tempfile)
    # Do not check `reopened`! we want it to be lazy at this point
    selected = reopened[f[0] == 1, :]
    frame_integrity_check(selected)
    # One value in eight equals 1.
    assert selected.shape == (nrows / 8, 1)
def test_empty_string_col():
    """
    Test that Frame with an empty string column can be saved/opened.
    See #604

    The temporary directory is removed in a ``finally`` clause so that a
    failure in any step does not leave it behind.
    """
    dir1 = tempfile.mkdtemp()
    try:
        dt0 = dt.Frame([[1, 2, 3], ["", "", ""]])
        dt0.save(dir1)
        dt1 = dt.open(dir1)
        assert_equals(dt0, dt1)
    finally:
        # mkdtemp() leaves cleanup to the caller.
        shutil.rmtree(dir1)
def test_save_view(tempdir):
    """Save a sorted view in "nff" format; the reopened frame is not a view."""
    base = dt.Frame([4, 0, -2, 3, 17, 2, 0, 1, 5], names=["fancy"])
    view = base.sort(0)
    assert view.internal.isview
    view.internal.check()

    view.save(tempdir, format="nff")
    restored = dt.open(tempdir)

    assert not restored.internal.isview
    restored.internal.check()
    assert restored.names == view.names
    assert restored.to_list() == view.to_list()
def test_empty_string_col():
    """
    Test that Frame with an empty string column can be saved/opened.
    See #604

    Saving in "nff" format is expected to emit a FutureWarning. The
    temporary directory is removed in a ``finally`` clause so that a
    failure in any step does not leave it behind.
    """
    dir1 = tempfile.mkdtemp()
    try:
        dt0 = dt.Frame([[1, 2, 3], ["", "", ""]])
        with pytest.warns(FutureWarning):
            dt0.save(dir1, format="nff")
        dt1 = dt.open(dir1)
        assert_equals(dt0, dt1)
    finally:
        # mkdtemp() leaves cleanup to the caller.
        shutil.rmtree(dir1)
def test_save_view(tempdir):
    """Save a sorted view in "nff" format (with FutureWarning) and reopen it."""
    base = dt.Frame([4, 0, -2, 3, 17, 2, 0, 1, 5], names=["fancy"])
    view = base.sort(0)
    assert isview(view)
    frame_integrity_check(view)

    with pytest.warns(FutureWarning):
        view.save(tempdir, format="nff")
    restored = dt.open(tempdir)

    assert not isview(restored)
    frame_integrity_check(restored)
    assert restored.names == view.names
    assert restored.to_list() == view.to_list()
def test_jay_object_columns(tempfile):
    """`to_jay()` warns about an obj64 column and the saved file omits it."""
    ints = [1, 2, 3, 4]
    objects = [(2, 3), (5, 6, 7), 9, {"A": 3}]
    frame = dt.Frame([ints, objects], names=["A", "B"])
    assert frame.stypes == (dt.int32, dt.obj64)

    with pytest.warns(DatatableWarning) as ws:
        frame.to_jay(tempfile)
    assert len(ws) == 1
    warning_text = ws[0].message.args[0]
    assert "Column `B` of type obj64 was not saved" in warning_text

    reopened = dt.open(tempfile)
    frame_integrity_check(reopened)
    assert reopened.names == ("A", )
    assert reopened.to_list() == [ints]
def test_jay_object_columns(tempfile):
    """Saving to Jay warns about an obj64 column and the saved file omits it."""
    ints = [1, 2, 3, 4]
    objects = [(2, 3), (5, 6, 7), 9, {"A": 3}]
    frame = dt.Frame([ints, objects], names=["A", "B"])
    assert frame.stypes == (dt.int8, dt.obj64)

    with pytest.warns(DatatableWarning) as ws:
        frame.save(tempfile, format="jay")
    assert len(ws) == 1
    warning_text = ws[0].message.args[0]
    assert "Column `B` of type obj64 was not saved" in warning_text

    reopened = dt.open(tempfile)
    reopened.internal.check()
    assert reopened.names == ("A",)
    assert reopened.topython() == [ints]
def test_jay_view(tempfile, seed):
    """Write a sorted view of 1000 random floats to Jay and compare on reopen."""
    random.seed(seed)
    samples = [random.normalvariate(0, 1) for _ in range(1000)]
    base = dt.Frame({"values": samples})
    view = base.sort(0)
    assert isview(view)

    view.to_jay(tempfile)
    assert os.path.isfile(tempfile)

    reopened = dt.open(tempfile)
    assert not isview(reopened)
    frame_integrity_check(view)
    frame_integrity_check(reopened)
    assert view.names == reopened.names
    assert view.stypes == reopened.stypes
    assert view.to_list() == reopened.to_list()
def test_jay_view(tempfile, seed):
    """Save a sorted view of 1000 random floats to Jay and compare on reopen."""
    random.seed(seed)
    samples = [random.normalvariate(0, 1) for _ in range(1000)]
    base = dt.Frame({"values": samples})
    view = base.sort(0)
    assert view.internal.isview

    view.save(tempfile, format="jay")
    assert os.path.isfile(tempfile)

    reopened = dt.open(tempfile)
    assert not reopened.internal.isview
    view.internal.check()
    reopened.internal.check()
    assert view.names == reopened.names
    assert view.stypes == reopened.stypes
    assert view.topython() == reopened.topython()
def test_obj_columns(tempdir):
    """Saving warns once about the object column; only column "A" comes back."""
    ints = [1, 2, 3, 4]
    objects = [(2, 3), (5, 6, 7), 9, {"A": 3}]
    frame = dt.Frame([ints, objects], names=["A", "B"])
    frame.internal.check()
    assert frame.ltypes == (dt.ltype.int, dt.ltype.obj)
    assert frame.shape == (4, 2)

    with pytest.warns(DatatableWarning) as ws:
        frame.save(tempdir)
    assert len(ws) == 1
    warning_text = ws[0].message.args[0]
    assert "Column 'B' of type obj64 was not saved" in warning_text
    del frame

    reopened = dt.open(tempdir)
    reopened.internal.check()
    assert reopened.shape == (4, 1)
    assert reopened.names == ("A", )
    assert reopened.topython() == [ints]
def test_obj_columns(tempdir):
    """Saving as "nff" gives a deprecation warning and an object-column warning."""
    ints = [1, 2, 3, 4]
    objects = [(2, 3), (5, 6, 7), 9, {"A": 3}]
    frame = dt.Frame([ints, objects], names=["A", "B"])
    frame_integrity_check(frame)
    assert frame.ltypes == (dt.ltype.int, dt.ltype.obj)
    assert frame.shape == (4, 2)

    with pytest.warns(DatatableWarning) as ws:
        frame.save(tempdir, format="nff")
    # Exactly two warnings are expected, in this order.
    assert len(ws) == 2
    deprecation_text = ws[0].message.args[0]
    skipped_text = ws[1].message.args[0]
    assert "Method `Frame.save()` is deprecated" in deprecation_text
    assert "Column 'B' of type obj64 was not saved" in skipped_text
    del frame

    reopened = dt.open(tempdir)
    frame_integrity_check(reopened)
    assert reopened.shape == (4, 1)
    assert reopened.names == ("A", )
    assert reopened.to_list() == [ints]
def test_jay_all_types(tempfile):
    """Round-trip a frame with nine columns of nine distinct stypes through Jay."""
    columns = [
        [True, False, None, True, True],
        [None, 1, -9, 12, 3],
        [4, 1346, 999, None, None],
        [591, 0, None, -395734, 19384709],
        [None, 777, 1093487019384, -384, None],
        [2.987, 3.45e-24, -0.189134e+12, 45982.1, None],
        [39408.301, 9.459027045e-125, 4.4508e+222, None, 3.14159],
        ["Life", "Liberty", "and", "Pursuit of Happiness", None],
        ["кохайтеся", "чорнобриві", ",", "та", "не з москалями"],
    ]
    column_stypes = [
        dt.bool8, dt.int8, dt.int16, dt.int32, dt.int64,
        dt.float32, dt.float64, dt.str32, dt.str64,
    ]
    column_names = ["b8", "i8", "i16", "i32", "i64", "f32", "f64", "s32", "s64"]
    d0 = dt.Frame(columns, stypes=column_stypes, names=column_names)

    # Force calculation of mins and maxs, so that they get saved into Jay
    d0.min()
    d0.max()
    assert len(set(d0.stypes)) == d0.ncols

    d0.save(tempfile, format="jay")
    assert os.path.isfile(tempfile)
    d1 = dt.open(tempfile)
    assert_equals(d0, d1)
# Benchmark metadata for this run.
git = pd.__git_version__
task = "groupby"
solution = "pandas"
fun = ".groupby"
cache = "TRUE"

# The dataset name comes from the environment; the file is data/<name>.jay.
data_name = os.environ['SRC_GRP_LOCAL']
src_grp = os.path.join("data", data_name + ".jay")
print("loading dataset %s" % data_name, flush=True)

## failed attempt to improve read_csv to read 1e9 rows #99
#dtype = {"id1": "category", "id2": "category", "id3": "category", "id4": "int32", "id5": "int32", "id6": "int32", "v1": "int32", "v2": "int32", "v3": "float64"}
#x = pd.read_csv(src_grp, dtype=dtype, engine="c", low_memory=True)

import datatable as dt  # for loading data only, see #47

x = dt.open(src_grp).to_pandas()
x['id1'] = x['id1'].astype('category')  # remove after datatable#1691
x['id2'] = x['id2'].astype('category')
x['id3'] = x['id3'].astype('category')
print(len(x.index), flush=True)

print("grouping...", flush=True)

question = "sum v1 by id1"  # q1
gc.collect()
# Timed section: keep the statement order below unchanged.
t_start = timeit.default_timer()
ans = x.groupby(['id1']).agg({'v1': 'sum'})
ans.reset_index(inplace=True)  # #68
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
# Run helpers.py in this script's namespace. The file is read inside a
# `with` block so its handle is closed instead of being left open.
with open("./helpers.py") as _helpers_file:
    exec(_helpers_file.read())

# Benchmark metadata for this run.
ver = pd.__version__
git = pd.__git_version__
task = "groupby"
solution = "pandas"
fun = ".groupby"
cache = "TRUE"

# The dataset name comes from the environment; the file is data/<name>.jay.
data_name = os.environ['SRC_GRP_LOCAL']
src_grp = os.path.join("data", data_name+".jay")
print("loading dataset %s" % data_name, flush=True)

import datatable as dt  # for loading data only, see #47

x = dt.open(src_grp).to_pandas()
x['id1'] = x['id1'].astype('category')
x['id2'] = x['id2'].astype('category')
x['id3'] = x['id3'].astype('category')
print(len(x.index), flush=True)

print("grouping...", flush=True)

question = "sum v1 by id1"  # q1
gc.collect()
# Timed section: keep the statement order below unchanged.
t_start = timeit.default_timer()
ans = x.groupby(['id1']).agg({'v1':'sum'})
ans.reset_index(inplace=True)  # #68
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
# memory_usage is not defined in this fragment; presumably it comes from helpers.py.
m = memory_usage()