def test_fread_columns_set_bad():
    """Requesting a column name that is absent from the input emits one warning."""
    source = "A,B,C\n1,2,3"
    requested = {"A", "foo"}
    with pytest.warns(UserWarning) as ws:
        dt.fread(text=source, columns=requested)
    # Exactly one warning is expected, and it must name the missing column.
    assert len(ws) == 1
    warning_text = ws[0].message.args[0]
    assert "Column(s) ['foo'] not found in the input" in warning_text
def test_fread_columns_empty(columns):
    """Check that the supplied `columns` selector keeps all three input columns.

    `columns` is provided by the caller (presumably a parametrized set of
    "empty" selectors -- the parametrization is not visible here).
    """
    # empty column selector should select all columns
    frame = dt.fread("A,B,C\n1,2,3", columns=columns)
    expected_names = ("A", "B", "C")
    assert frame.shape == (1, 3)
    assert frame.names == expected_names
    assert frame.topython() == [[1], [2], [3]]
def test_fread_columns_list_bad1():
    """A `columns` list shorter than the number of input columns raises ValueError."""
    expected_message = ("Input file contains 2 columns, whereas `columns` parameter "
                        "specifies only 1 column")
    with pytest.raises(ValueError) as e:
        dt.fread(text="C1,C2\n1,2\n3,4\n", columns=["C2"])
    assert expected_message in str(e.value)
def test_fread_columns_list_bad3():
    """An unsupported type override (`bytes`) in `columns` raises ValueError."""
    with pytest.raises(ValueError) as e:
        dt.fread(text="C1,C2\n1,2", columns=["C1", ("C2", bytes)])
    # Match against the exception itself (e.value), not the ExceptionInfo
    # wrapper, whose str() is a repr-style string in recent pytest versions.
    assert "Unknown type <class 'bytes'> used as an override for column 'C2'" \
        in str(e.value)
def test_fread_columns_range_bad2():
    """A `columns` range extending past the 5 input columns raises ValueError."""
    too_wide = range(13)
    with pytest.raises(ValueError) as e:
        dt.fread(text="A,B,C,D,E\n1,2,3,4,5", columns=too_wide)
    message = str(e.value)
    assert "Invalid range iterator" in message
def test_fread_columns_list3():
    """A (name, type) entry renames and retypes a column; the None entries are dropped."""
    selector = [("foo", str), None, None]
    frame = dt.fread(text="A,B,C\n1,2,3", columns=selector)
    assert frame.internal.check()
    # Only the first column survives, renamed to "foo" and read as a string.
    assert frame.names == ("foo", )
    assert frame.topython() == [["1"]]
def test_fread_bad_source_none():
    """Calling fread with no source at all raises ValueError."""
    with pytest.raises(ValueError) as e:
        dt.fread()
    # Inspect the raised exception (e.value) rather than the ExceptionInfo
    # wrapper, consistent with the other tests that use str(e.value).
    assert "No input source" in str(e.value)
# NOTE: this import deliberately shadows the builtins sum/min/max with
# datatable's expression functions for the rest of the script.
from datatable import f, sum, mean, count, sd, min, max, by, sort

# Execute the shared helper script in this module's namespace.  The file is
# opened in a `with` block so the handle is closed deterministically instead
# of being left to the garbage collector.
# (presumably defines memory_usage, write_log, make_chk and flatten used
# below -- confirm against helpers.py)
with open("./helpers.py") as _helpers_file:
    exec(_helpers_file.read())

# Metadata recorded with every benchmark log entry.
ver = dt.__version__
git = dt.__git_revision__
task = "groupby"
solution = "pydatatable"
fun = "[.datatable"
cache = "TRUE"

# The dataset name comes from the environment; the CSV is expected under data/.
data_name = os.environ['SRC_GRP_LOCAL']
src_grp = os.path.join("data", data_name+".csv")
print("loading dataset %s" % data_name, flush=True)

x = dt.fread(src_grp)
print(x.nrows, flush=True)

print("grouping...", flush=True)

question = "sum v1 by id1" # q1
gc.collect()
# Timed section: the group-by itself plus printing the result shape.
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1)}, by(f.id1)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
# Separately timed checksum over the answer.
t_start = timeit.default_timer()
chk = ans[:, sum(f.v1)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
def test_fread_file_not_exists():
    """Reading a non-existent file name raises ValueError naming the missing path."""
    name = "qerubvwpif8rAIB9845gb1_"
    path = os.path.abspath(".")
    with pytest.raises(ValueError) as e:
        dt.fread(name)
    # Compare against the exception message itself: the ExceptionInfo wrapper's
    # str() is repr-based in recent pytest and would escape characters such as
    # backslashes in the path.
    assert ("File %s`/%s` does not exist" % (path, name)) in str(e.value)
def test_fread_file_is_directory():
    """Passing a directory path to fread raises ValueError."""
    path = os.path.abspath(".")
    with pytest.raises(ValueError) as e:
        dt.fread(path)
    # Use the exception message (e.value) so path characters are not subject
    # to the repr-style escaping of str(ExceptionInfo).
    assert ("Path `%s` is not a file" % path) in str(e.value)
import pandas as pd
import datatable as dt
import base64
from io import StringIO

# Small CSV payload used by every step below.
str_data = 'a,b,c\n1,2,3\nTrue,False,True'

# read from string using StringIO
# (a buffer created with an initial value starts positioned at offset 0)
s = StringIO(str_data)
df = pd.read_csv(s)

# directly read from string to datatable
dt_df = dt.Frame(str_data)

# encode into jay
jay_data = dt_df.to_jay()
print(jay_data)
# dt_df = dt.fread(jay_data)

# Round-trip the jay bytes through base64 and read the decoded bytes back.
b64_data = base64.b64encode(jay_data)
print(b64_data)

deb64_data = base64.b64decode(b64_data)
print(deb64_data)

dt_df = dt.fread(deb64_data)
print(dt_df)
def test_write_spacenames():
    """Column names with leading/trailing spaces are quoted in CSV and survive a round trip."""
    spaced_names = [" foo", "bar ", " "]
    columns = [[1, 2, 3], [1, 2, 3], [0, 0, 0]]
    d = dt.Frame(columns, names=spaced_names)
    expected_csv = '" foo","bar "," "\n1,1,0\n2,2,0\n3,3,0\n'
    assert d.to_csv() == expected_csv
    # Reading the written text back must reproduce the same data.
    dd = dt.fread(text=d.to_csv())
    assert d.to_list() == dd.to_list()
import pandas as pd
import pprint

# Connect to the local MongoDB server.  On failure, report it and re-raise:
# continuing would only fail later with a NameError on `myclient`, hiding the
# real cause.
try:
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
except Exception:
    print("Could not connect to Mongo")
    raise

# Same policy for selecting the database.
try:
    mydb = myclient["kaggle"]
except Exception:
    print("Could not open database kaggle")
    raise

mycollection = mydb['testje']

# Load the CSV with datatable and show a quick summary.
mf = dt.fread("movies.csv")
print(mf.head())
print(mf.shape)
print(mf.names)

# Convert the frame to the other supported container types.
nf = mf.to_numpy()
pf = mf.to_pandas()
lf = mf.to_list()
print(type(mf))
print(type(nf))
print(type(pf))

# Insert one document per row, built from the pandas representation.
data = pf.to_dict(orient='records')
mycollection.insert_many(data)
def test_fread_from_url2():
    """A `file://` URL pointing at the local LICENSE file is read as a single column."""
    license_url = "file://" + os.path.abspath("LICENSE")
    frame = dt.fread(license_url, sep="\n")
    assert frame.internal.check()
    # With newline as the separator every line becomes one row of one column.
    assert frame.shape == (372, 1)
def test_fread_bad_source_any_and_source():
    """Combining a positional source with `text=` raises ValueError."""
    with pytest.raises(ValueError) as e:
        dt.fread("a", text="b")
    # Check the exception message (e.value), not the ExceptionInfo wrapper.
    assert "When an unnamed argument is passed, it is invalid to also " \
           "provide the `text` parameter" in str(e.value)
def fun(nGPUs=1, nFolds=1, nLambdas=100, nAlphas=8, classification=False, use_seed=True, validFraction=0.0):
    """Fit a GLM on the Hyatt CSV data and assert that the reported RMSE values stay bounded.

    Args:
        nGPUs: number of GPUs; any value > 0 makes the run use the GPU.
        nFolds: forwarded to the solver helper.
        nAlphas: forwarded to the solver helper.
        nLambdas: forwarded to the solver helper.
        classification: only consulted by the currently disabled datatable branch.
        use_seed: selects a fixed seed value; NOTE(review): the seed is not
            passed on to anything in this function.
        validFraction: only forwarded by the currently disabled
            ``elastic_net`` branch.

    Raises:
        AssertionError: if any of the checked RMSE values exceeds its threshold.
    """
    # Label for the run: the name of the *calling* function (one frame up).
    name = str(sys._getframe(1).f_code.co_name)
    t = time.time()

    print("cwd: %s" % (os.getcwd()))
    sys.stdout.flush()

    use_gpu = nGPUs > 0

    display = 1
    write = 1

    # seed = np.random.randint(0, 2 ** 31 - 1)
    seed = 1034753 if use_seed else None

    print("Reading Data")
    if 1 == 0:  # not yet
        # Disabled path: would load train/test data through datatable.
        t1 = time.time()
        target = None
        import datatable as dt  # omp problem in pycharm
        train = find_file("./testsbig/data/xtrain.txt")
        test = find_file("./testsbig/data/xtest.txt")

        train = os.path.normpath(os.path.join(os.getcwd(), train))
        train_df = dt.fread(train).topandas()
        train_df = train_df[pd.notnull(train_df[target])].reset_index(drop=True)  # drop rows with NA response

        test = os.path.normpath(os.path.join(os.getcwd(), test))
        test_df = dt.fread(test).topandas()
        test_df = test_df[pd.notnull(test_df[target])].reset_index(drop=True)  # drop rows with NA response

        y = train_df[target]
        df_before = copy.deepcopy(train_df)

        classes = 1 if not classification else len(y.unique())
        print("Testing GLM for " + ((str(classes) + "-class classification") if classes >= 2 else "regression"))
    else:
        if 1 == 1:  # avoid for now so get info
            # should all be explicitly np.float32 or all np.float64
            xtrain = np.loadtxt("./data/xtrainhyatt.csv", delimiter=',', dtype=np.float32)
            ytrain = np.loadtxt("./data/ytrainhyatt.csv", delimiter=',', dtype=np.float32)
            xtest = np.loadtxt("./data/xtesthyatt.csv", delimiter=',', dtype=np.float32)
            ytest = np.loadtxt("./data/ytesthyatt.csv", delimiter=',', dtype=np.float32)
            # Unit weight for every training row.
            wtrain = np.ones((xtrain.shape[0], 1), dtype=np.float32)

            t1 = time.time()
            pred_val, rmse_train, rmse_test = runglm(nFolds, nAlphas, nLambdas, xtrain, ytrain, xtest, ytest, wtrain,
                                                     write, display, use_gpu, name=name)
        else:
            xfull = np.loadtxt("./data/xtrainhyatt.csv", delimiter=',', dtype=np.float32)
            yfull = np.loadtxt("./data/ytrainhyatt.csv", delimiter=',', dtype=np.float32)

            t1 = time.time()
            rmse_train, rmse_test = elastic_net(xfull, yfull, nGPUs=nGPUs, nlambda=nLambdas, nfolds=nFolds,
                                                nalpha=nAlphas, validFraction=validFraction, verbose=0, name=name)
        print("Testing GLM")

    # check rmse
    print(rmse_train[0, 0])
    print(rmse_train[0, 1])
    print(rmse_train[0, 2])
    print(rmse_test[0, 2])
    sys.stdout.flush()

    # FIXME: But these below should really be order 1 to 1.5 according to Wamsi!
    assert rmse_train[0, 0] < 20
    assert rmse_train[0, 1] < 20
    assert rmse_train[0, 2] < 31
    assert rmse_test[0, 2] < 31

    # t1 marks the start of the solver call, t the start of the whole function.
    print('\n Total execution time:%d' % (time.time() - t1))

    print("TEST PASSED")
    sys.stdout.flush()

    print("Time taken: {}".format(time.time() - t))
    #    endfunnel(pipes)
    print("DONE.")
    sys.stdout.flush()
def test_fread_from_file2():
    """CSV text passed through the `file=` parameter is rejected with ValueError."""
    csv_text_not_a_path = "a,b\n1,2"
    with pytest.raises(ValueError):
        dt.fread(file=csv_text_not_a_path)
def test_fread_columns_range_bad1():
    """A `columns` range with a negative step raises ValueError."""
    descending = range(3, 0, -1)
    with pytest.raises(ValueError) as e:
        dt.fread(text="A,B,C,D,E\n1,2,3,4,5", columns=descending)
    message = str(e.value)
    assert "Cannot use slice/range with negative step" in message
def test_fread_bad_source_2sources():
    """Passing both `file=` and `text=` raises ValueError."""
    with pytest.raises(ValueError) as e:
        dt.fread(file="a", text="b")
    # Check the exception message (e.value), not the ExceptionInfo wrapper.
    assert "Both parameters `file` and `text` cannot be passed to fread " \
           "simultaneously" in str(e.value)
def test_fread_columns_list2():
    """String entries in a `columns` list rename columns; a None entry drops one."""
    selector = ["foo", None, "baz"]
    frame = dt.fread(text="A,B,C\n1,2,3", columns=selector)
    assert frame.internal.check()
    # The middle column is omitted; the outer two carry the new names.
    assert frame.names == ("foo", "baz")
    assert frame.topython() == [[1], [3]]
def test_fread_bad_source_anysource():
    """An integer positional source raises TypeError."""
    with pytest.raises(TypeError) as e:
        dt.fread(12345)
    # Check the exception message (e.value), not the ExceptionInfo wrapper.
    assert "Unknown type for the first argument in fread" in str(e.value)
def test_fread_from_cmd1():
    """The output of a shell command given via `cmd=` is parsed into a valid frame."""
    listing = dt.fread(cmd="ls -l")
    assert listing.internal.check()
def test_fread_bad_source_text():
    """A list passed as `text=` raises TypeError."""
    with pytest.raises(TypeError) as e:
        dt.fread(text=["a", "b", "c"])
    # Check the exception message (e.value), not the ExceptionInfo wrapper.
    assert ("Invalid parameter `text` in fread: expected str or bytes"
            in str(e.value))
def test_fread_columns_list_bad2():
    """An integer entry in a `columns` list raises TypeError."""
    selector_with_int = ["C1", 2]
    with pytest.raises(TypeError):
        dt.fread(text="C1,C2\n1,2\n3,4\n", columns=selector_with_int)
def test_fread_bad_source_file():
    """A non-path object (here the TypeError class) passed as `file=` raises TypeError."""
    with pytest.raises(TypeError) as e:
        dt.fread(file=TypeError)
    # Check the exception message (e.value), not the ExceptionInfo wrapper.
    assert ("Invalid parameter `file` in fread: expected a str/bytes/PathLike"
            in str(e.value))
def test_fread_columns_set1():
    """A set of names in `columns` selects exactly those columns, in input order."""
    text = ("C1,C2,C3,C4\n"
            "1,3.3,7,\"Alice\"\n"
            "2,,,\"Bob\"")
    wanted = {"C1", "C3"}
    frame = dt.fread(text=text, columns=wanted)
    assert frame.internal.check()
    assert frame.names == ("C1", "C3")
    # The empty C3 field in the second data row is read as None.
    assert frame.topython() == [[1, 2], [7, None]]
def test_fread_bad_source_cmd():
    """A list passed as `cmd=` raises TypeError."""
    with pytest.raises(TypeError) as e:
        dt.fread(cmd=["ls", "-l", ".."])
    # Check the exception message (e.value), not the ExceptionInfo wrapper.
    assert "Invalid parameter `cmd` in fread: expected str" in str(e.value)
def test_fread_columns_dict2():
    """A dict selector renames "A" to "a"; the Ellipsis key mapped to None drops the rest."""
    selector = {"A": "a", ...: None}
    frame = dt.fread(text="A,B,C,D\n1,2,3,4", columns=selector)
    assert frame.names == ("a", )
    assert frame.topython() == [[1]]
def test_fread_from_text1():
    """A single-token text input yields one column named "A" with no rows."""
    frame = dt.fread(text="A")
    assert frame.internal.check()
    expected_names = ("A", )
    expected_shape = (0, 1)
    assert frame.names == expected_names
    assert frame.shape == expected_shape
def test_fread_from_url1():
    """A string without a URL scheme passed as `url=` raises ValueError."""
    with pytest.raises(ValueError) as e:
        dt.fread(url="A")
    # Check the exception message (e.value), not the ExceptionInfo wrapper.
    assert "unknown url type" in str(e.value)
def test_fread_columns_slice():
    """A slice with step 2 in `columns` selects every other input column."""
    every_other = slice(None, None, 2)
    frame = dt.fread(text="A,B,C,D,E\n1,2,3,4,5", columns=every_other)
    assert frame.internal.check()
    assert frame.names == ("A", "C", "E")
    assert frame.topython() == [[1], [3], [5]]
# Metadata recorded with every benchmark log entry.
git = dt.__git_revision__
task = "read"
data_name = os.path.basename(src_grp)
solution = "pydatatable"
fun = "fread"
cache = "TRUE"

# Count the lines of the input with `wc -l`.  The output is split on arbitrary
# whitespace so that a left-padded count (as printed by BSD/macOS `wc`) is
# parsed correctly; splitting on a single space would yield an empty string.
wc_lines = (
    subprocess.run(['wc', '-l', data_name], stdout=subprocess.PIPE)
    .stdout.decode('utf-8')
    .split(None, 1)[0]
)
# One line is subtracted from the count (presumably the CSV header row).
in_rows = int(wc_lines)-1

print("reading...")

question = "all rows" #1
gc.collect()
# First timed run: the read itself plus printing the result shape.
t_start = timeit.default_timer()
ans = dt.fread(data_name, show_progress=False)
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
# Separately timed checksum over column v3.
t_start = timeit.default_timer()
chk = ans[:, sum(f.v3)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
del ans

gc.collect()
# Second timed run of the same read.
t_start = timeit.default_timer()
ans = dt.fread(data_name, show_progress=False)
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
def test_fread_columns_range():
    """A `range(3)` in `columns` selects the first three input columns."""
    first_three = range(3)
    frame = dt.fread(text="A,B,C,D,E\n1,2,3,4,5", columns=first_three)
    assert frame.internal.check()
    assert frame.names == ("A", "B", "C")
    assert frame.topython() == [[1], [2], [3]]
# Paths of the two join inputs come from the environment.
src_x = os.environ['SRC_X_LOCAL']
src_y = os.environ['SRC_Y_LOCAL']

# Metadata recorded with the benchmark log entry.
ver = dt.__version__
git = dt.__git_revision__
task = "join"
question = "inner join"
# Both inputs are addressed by base name only; the log's data name joins the two.
l = [os.path.basename(src_x), os.path.basename(src_y)]
data_name = '-'.join(l)
solution = "pydatatable"
fun = "merge"
cache = "TRUE"

print("loading datasets...")

x = dt.fread(l[0])
y = dt.fread(l[1])

print("joining...")

gc.collect()
# Timed section: the join itself plus printing the result shape.
t_start = timeit.default_timer()
ans = x.merge(y, how='inner', on='KEY')
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
# Separately timed checksum over one column from each side.
t_start = timeit.default_timer()
chk = ans[:, [sum(f.X2), sum(f.Y2)]]
chkt = timeit.default_timer() - t_start
write_log(
    task=task,
    data=data_name,
    in_rows=x.shape[0],
    question=question,
    out_rows=ans.shape[0],
    out_cols=ans.shape[1],
    solution=solution,
    version=ver,
    git=git,
    fun=fun,
    run=1,
    time_sec=t,
    mem_gb=m,
    cache=cache,
    chk=make_chk(flatten(chk.topython())),
    chk_time_sec=chkt,
)
del ans
# Execute the shared helper script in this module's namespace.  The file is
# opened in a `with` block so the handle is closed deterministically instead
# of being left to the garbage collector.
# (presumably defines memory_usage, write_log, make_chk and flatten used
# below -- confirm against helpers.py)
with open("./helpers.py") as _helpers_file:
    exec(_helpers_file.read())

src_x = os.environ['SRC_X_LOCAL']

# Metadata recorded with the benchmark log entry.
ver = dt.__version__
git = dt.__git_revision__
task = "sort"
question = "by int KEY"
data_name = os.path.basename(src_x)
solution = "pydatatable"
fun = ".sort"
cache = "TRUE"

print("loading dataset...")

x = dt.fread(data_name)

print("sorting...")

gc.collect()
# Timed section: the sort itself plus printing the result shape.
t_start = timeit.default_timer()
ans = x.sort('KEY')
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
# Separately timed checksum over column X2.
t_start = timeit.default_timer()
chk = ans[:, sum(f.X2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)

del ans