Ejemplo n.º 1
0
def test_fread_columns_set_bad():
    """A set selector naming a column absent from the input emits one warning."""
    expected_fragment = "Column(s) ['foo'] not found in the input"
    with pytest.warns(UserWarning) as caught:
        dt.fread(text="A,B,C\n1,2,3", columns={"A", "foo"})
    # Exactly one warning is expected, and its text must name the bad column.
    assert len(caught) == 1
    first_text = caught[0].message.args[0]
    assert expected_fragment in first_text
Ejemplo n.º 2
0
def test_fread_columns_empty(columns):
    """An empty `columns` selector keeps every column of the input.

    `columns` is supplied by the caller (presumably a parametrized set of
    empty selectors -- confirm against the decorator/fixture).
    """
    expected_names = ("A", "B", "C")
    frame = dt.fread("A,B,C\n1,2,3", columns=columns)
    assert frame.shape == (1, len(expected_names))
    assert frame.names == expected_names
    assert frame.topython() == [[1], [2], [3]]
Ejemplo n.º 3
0
def test_fread_columns_list_bad1():
    """A `columns` list shorter than the input's column count raises ValueError."""
    expected = ("Input file contains 2 columns, whereas `columns` parameter "
                "specifies only 1 column")
    with pytest.raises(ValueError) as excinfo:
        dt.fread(text="C1,C2\n1,2\n3,4\n", columns=["C2"])
    message = str(excinfo.value)
    assert expected in message
Ejemplo n.º 4
0
def test_fread_columns_list_bad3():
    """An unsupported type override (`bytes`) in `columns` raises ValueError."""
    with pytest.raises(ValueError) as e:
        dt.fread(text="C1,C2\n1,2", columns=["C1", ("C2", bytes)])
    # Match against the raised exception (e.value): str() of the ExceptionInfo
    # wrapper is pytest-version-dependent and is not the plain message.
    assert "Unknown type <class 'bytes'> used as an override for column 'C2'" \
           in str(e.value)
Ejemplo n.º 5
0
def test_fread_columns_range_bad2():
    """A `range(13)` selector on a 5-column input raises ValueError."""
    oversized = range(13)
    with pytest.raises(ValueError) as excinfo:
        dt.fread(text="A,B,C,D,E\n1,2,3,4,5", columns=oversized)
    message = str(excinfo.value)
    assert "Invalid range iterator" in message
Ejemplo n.º 6
0
def test_fread_columns_list3():
    """`None` entries drop columns; a (name, type) tuple renames and retypes one."""
    selector = [("foo", str), None, None]
    frame = dt.fread(text="A,B,C\n1,2,3", columns=selector)
    assert frame.internal.check()
    # Only the first column survives, renamed and read as a string.
    assert frame.names == ("foo", )
    assert frame.topython() == [["1"]]
Ejemplo n.º 7
0
def test_fread_bad_source_none():
    """Calling fread with no arguments at all raises ValueError."""
    with pytest.raises(ValueError) as e:
        dt.fread()
    # Match against the raised exception (e.value): str() of the ExceptionInfo
    # wrapper is pytest-version-dependent and is not the plain message.
    assert "No input source" in str(e.value)
Ejemplo n.º 8
0
from datatable import f, sum, mean, count, sd, min, max, by, sort

# Pull the shared benchmark helpers into this module's namespace.  The context
# manager closes helpers.py right after its source is read instead of leaving
# the handle open for the rest of the run.
# (write_log, memory_usage, make_chk and flatten used below are not defined in
# this script -- presumably they come from helpers.py.)
with open("./helpers.py") as _helpers_fh:
    exec(_helpers_fh.read())

# Metadata recorded with every timing passed to write_log().
ver = dt.__version__
git = dt.__git_revision__
task = "groupby"
solution = "pydatatable"
fun = "[.datatable"
cache = "TRUE"

# The dataset name comes from the SRC_GRP_LOCAL environment variable and is
# read from data/<name>.csv.
data_name = os.environ['SRC_GRP_LOCAL']
src_grp = os.path.join("data", data_name+".csv")
print("loading dataset %s" % data_name, flush=True)

x = dt.fread(src_grp)
print(x.nrows, flush=True)

print("grouping...", flush=True)

question = "sum v1 by id1" # q1
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1)}, by(f.id1)]
print(ans.shape, flush=True)  # printing the shape is inside the timed region
t = timeit.default_timer() - t_start
m = memory_usage()
# The checksum of the answer is timed separately from the query itself.
t_start = timeit.default_timer()
chk = ans[:, sum(f.v1)]
chkt = timeit.default_timer() - t_start
write_log(
    task=task,
    data=data_name,
    in_rows=x.shape[0],
    question=question,
    out_rows=ans.shape[0],
    out_cols=ans.shape[1],
    solution=solution,
    version=ver,
    git=git,
    fun=fun,
    run=1,
    time_sec=t,
    mem_gb=m,
    cache=cache,
    chk=make_chk(flatten(chk.to_list())),
    chk_time_sec=chkt,
)
Ejemplo n.º 9
0
def test_fread_file_not_exists():
    """Reading a non-existent file name raises ValueError naming the path."""
    name = "qerubvwpif8rAIB9845gb1_"
    path = os.path.abspath(".")
    with pytest.raises(ValueError) as e:
        dt.fread(name)
    # Match against the raised exception (e.value): str() of the ExceptionInfo
    # wrapper is pytest-version-dependent and is not the plain message.
    assert ("File %s`/%s` does not exist" % (path, name)) in str(e.value)
Ejemplo n.º 10
0
def test_fread_file_is_directory():
    """Passing a directory path to fread raises ValueError."""
    path = os.path.abspath(".")
    with pytest.raises(ValueError) as e:
        dt.fread(path)
    # Match against the raised exception (e.value): str() of the ExceptionInfo
    # wrapper is pytest-version-dependent and is not the plain message.
    assert ("Path `%s` is not a file" % path) in str(e.value)
Ejemplo n.º 11
0
import pandas as pd
import datatable as dt

import base64
from io import StringIO

str_data = 'a,b,c\n1,2,3\nTrue,False,True'

# pandas route: wrap the text in an in-memory buffer (positioned at the start)
# and parse it as CSV
s = StringIO(str_data)
df = pd.read_csv(s)

# datatable route: the Frame constructor is handed the raw string directly
dt_df = dt.Frame(str_data)

# serialise the frame to Jay with no destination given; the returned payload
# is kept in `jay_data` and printed
jay_data = dt_df.to_jay()
print(jay_data)
# (reading `jay_data` straight back with dt.fread is left disabled here; the
# base64-decoded copy is read further down instead)

# base64 round trip of the Jay payload, printing each stage
b64_data = base64.b64encode(jay_data)
print(b64_data)
deb64_data = base64.b64decode(b64_data)
print(deb64_data)

# rebuild the frame from the decoded bytes
dt_df = dt.fread(deb64_data)
print(dt_df)
Ejemplo n.º 12
0
def test_write_spacenames():
    """Names with leading/trailing spaces are quoted in CSV; data re-reads equal."""
    columns = [[1, 2, 3], [1, 2, 3], [0, 0, 0]]
    frame = dt.Frame(columns, names=["  foo", "bar ", " "])
    expected_csv = '"  foo","bar "," "\n1,1,0\n2,2,0\n3,3,0\n'
    assert frame.to_csv() == expected_csv
    # Feed the generated CSV back through fread and compare the column data.
    reread = dt.fread(text=frame.to_csv())
    assert frame.to_list() == reread.to_list()
Ejemplo n.º 13
0
import pandas as pd
import pprint

# Connect to the local MongoDB instance.  `except Exception` (rather than a
# bare `except:`) lets KeyboardInterrupt/SystemExit propagate, and the error is
# re-raised after the message because nothing below can run without a client.
try:
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
except Exception:
    print("Could not connect to Mongo")
    raise

# Select the "kaggle" database on that client.
try:
    mydb = myclient["kaggle"]
except Exception:
    print("Could not open database kaggle")
    raise

mycollection = mydb['testje']

# Read the CSV with datatable and show a quick summary of the frame.
mf = dt.fread("movies.csv")
print(mf.head())
print(mf.shape)
print(mf.names)

# Convert the frame to numpy, pandas and plain Python lists.
nf = mf.to_numpy()
pf = mf.to_pandas()
lf = mf.to_list()

print(type(mf))
print(type(nf))
print(type(pf))

# Insert one document per row, built from the pandas records.
data = pf.to_dict(orient='records')
mycollection.insert_many(data)
Ejemplo n.º 14
0
def test_fread_from_url2():
    """A `file://` URL for the local LICENSE file reads as a one-column frame."""
    license_url = "file://" + os.path.abspath("LICENSE")
    frame = dt.fread(license_url, sep="\n")
    assert frame.internal.check()
    # With newline as the separator each line becomes a row of one column.
    assert frame.shape == (372, 1)
Ejemplo n.º 15
0
def test_fread_bad_source_any_and_source():
    """Combining an unnamed source argument with `text=` raises ValueError."""
    with pytest.raises(ValueError) as e:
        dt.fread("a", text="b")
    # Match against the raised exception (e.value): str() of the ExceptionInfo
    # wrapper is pytest-version-dependent and is not the plain message.
    assert "When an unnamed argument is passed, it is invalid to also " \
           "provide the `text` parameter" in str(e.value)
Ejemplo n.º 16
0
def fun(nGPUs=1, nFolds=1, nLambdas=100, nAlphas=8, classification=False, use_seed=True, validFraction=0.0):
    """Run the GLM test on the "hyatt" CSV data and assert the RMSEs are bounded.

    The active path loads train/test matrices with ``np.loadtxt`` as float32,
    calls ``runglm`` and checks four entries of the returned RMSE arrays
    against fixed thresholds.  Two alternative paths (datatable loading and
    ``elastic_net``) are present but disabled by constant conditions.

    Args:
        nGPUs: Number of GPUs; any value > 0 sets ``use_gpu`` for ``runglm``.
        nFolds: Forwarded to ``runglm`` (and to the disabled ``elastic_net``).
        nLambdas: Forwarded to ``runglm`` (and to the disabled ``elastic_net``).
        nAlphas: Forwarded to ``runglm`` (and to the disabled ``elastic_net``).
        classification: Only read by the disabled datatable branch.
        use_seed: When true a fixed seed value is chosen; the seed is not
            currently passed on to any call in this function.
        validFraction: Only read by the disabled ``elastic_net`` branch.

    Raises:
        AssertionError: If any checked RMSE is at or above its threshold.
    """
    # Label the run with the *caller's* function name (frame 1); the previous
    # lookup of this function's own name was immediately overwritten.
    name = str(sys._getframe(1).f_code.co_name)
    t = time.time()

    print("cwd: %s" % (os.getcwd()))
    sys.stdout.flush()

    use_gpu = nGPUs > 0

    display = 1
    write = 1

    # seed = np.random.randint(0, 2 ** 31 - 1)
    seed = 1034753 if use_seed else None

    print("Reading Data")
    if 1 == 0:  # not yet
        # NOTE(review): this disabled branch never sets rmse_train/rmse_test,
        # which the checks below read, and `target` is None -- it cannot be
        # enabled as-is.
        t1 = time.time()
        target = None
        import datatable as dt  # omp problem in pycharm
        train = find_file("./testsbig/data/xtrain.txt")
        test = find_file("./testsbig/data/xtest.txt")

        train = os.path.normpath(os.path.join(os.getcwd(), train))
        train_df = dt.fread(train).topandas()
        train_df = train_df[pd.notnull(train_df[target])].reset_index(drop=True)  # drop rows with NA response

        test = os.path.normpath(os.path.join(os.getcwd(), test))
        test_df = dt.fread(test).topandas()
        test_df = test_df[pd.notnull(test_df[target])].reset_index(drop=True)  # drop rows with NA response

        y = train_df[target]

        df_before = copy.deepcopy(train_df)

        classes = 1 if not classification else len(y.unique())
        print("Testing GLM for " + ((str(classes) + "-class classification") if classes >= 2 else "regression"))
    else:
        if 1 == 1:  # avoid for now so get info
            # should all be explicitly np.float32 or all np.float64
            xtrain = np.loadtxt("./data/xtrainhyatt.csv", delimiter=',', dtype=np.float32)
            ytrain = np.loadtxt("./data/ytrainhyatt.csv", delimiter=',', dtype=np.float32)
            xtest = np.loadtxt("./data/xtesthyatt.csv", delimiter=',', dtype=np.float32)
            ytest = np.loadtxt("./data/ytesthyatt.csv", delimiter=',', dtype=np.float32)
            # unit weight for every training row
            wtrain = np.ones((xtrain.shape[0], 1), dtype=np.float32)

            # t1 starts after loading, so the "Total execution time" below
            # covers the fit only
            t1 = time.time()
            pred_val, rmse_train, rmse_test = runglm(nFolds, nAlphas, nLambdas, xtrain, ytrain, xtest, ytest, wtrain,
                                                     write, display, use_gpu, name=name)
        else:
            xfull = np.loadtxt("./data/xtrainhyatt.csv", delimiter=',', dtype=np.float32)
            yfull = np.loadtxt("./data/ytrainhyatt.csv", delimiter=',', dtype=np.float32)

            t1 = time.time()
            rmse_train, rmse_test = elastic_net(xfull, yfull, nGPUs=nGPUs, nlambda=nLambdas, nfolds=nFolds,
                                                nalpha=nAlphas,
                                                validFraction=validFraction, verbose=0, name=name)
        print("Testing GLM")

    # check rmse
    print(rmse_train[0, 0])
    print(rmse_train[0, 1])
    print(rmse_train[0, 2])
    print(rmse_test[0, 2])
    sys.stdout.flush()

    # FIXME: But these below should really be order 1 to 1.5 according to Wamsi!
    assert rmse_train[0, 0] < 20
    assert rmse_train[0, 1] < 20
    assert rmse_train[0, 2] < 31
    assert rmse_test[0, 2] < 31

    # "\n" (was the typo "/n") puts the timing line after a blank line
    print('\n Total execution time:%d' % (time.time() - t1))

    print("TEST PASSED")
    sys.stdout.flush()

    print("Time taken: {}".format(time.time() - t))
    #    endfunnel(pipes)
    print("DONE.")
    sys.stdout.flush()
Ejemplo n.º 17
0
def test_fread_from_file2():
    """CSV text handed to the `file` parameter is rejected with ValueError."""
    csv_text_not_a_path = "a,b\n1,2"
    with pytest.raises(ValueError):
        dt.fread(file=csv_text_not_a_path)
Ejemplo n.º 18
0
def test_fread_columns_range_bad1():
    """A `range` selector with a negative step raises ValueError."""
    descending = range(3, 0, -1)
    with pytest.raises(ValueError) as excinfo:
        dt.fread(text="A,B,C,D,E\n1,2,3,4,5", columns=descending)
    message = str(excinfo.value)
    assert "Cannot use slice/range with negative step" in message
Ejemplo n.º 19
0
def test_fread_bad_source_2sources():
    """Passing both `file=` and `text=` to fread raises ValueError."""
    with pytest.raises(ValueError) as e:
        dt.fread(file="a", text="b")
    # Match against the raised exception (e.value): str() of the ExceptionInfo
    # wrapper is pytest-version-dependent and is not the plain message.
    assert "Both parameters `file` and `text` cannot be passed to fread " \
           "simultaneously" in str(e.value)
Ejemplo n.º 20
0
def test_fread_columns_list2():
    """String entries rename columns and a `None` entry drops its column."""
    selector = ["foo", None, "baz"]
    frame = dt.fread(text="A,B,C\n1,2,3", columns=selector)
    assert frame.internal.check()
    # The middle column is gone; the outer two carry the new names.
    assert frame.names == ("foo", "baz")
    assert frame.topython() == [[1], [3]]
Ejemplo n.º 21
0
def test_fread_bad_source_anysource():
    """An integer as the unnamed source argument raises TypeError."""
    with pytest.raises(TypeError) as e:
        dt.fread(12345)
    # Match against the raised exception (e.value): str() of the ExceptionInfo
    # wrapper is pytest-version-dependent and is not the plain message.
    assert "Unknown type for the first argument in fread" in str(e.value)
Ejemplo n.º 22
0
def test_fread_from_cmd1():
    """Output of a shell command given via `cmd` reads into a valid frame."""
    listing = dt.fread(cmd="ls -l")
    assert listing.internal.check()
Ejemplo n.º 23
0
def test_fread_bad_source_text():
    """A list passed as `text=` raises TypeError."""
    with pytest.raises(TypeError) as e:
        dt.fread(text=["a", "b", "c"])
    # Match against the raised exception (e.value): str() of the ExceptionInfo
    # wrapper is pytest-version-dependent and is not the plain message.
    assert "Invalid parameter `text` in fread: expected str or bytes" in str(e.value)
Ejemplo n.º 24
0
def test_fread_columns_list_bad2():
    """An integer entry in a `columns` list raises TypeError."""
    bad_selector = ["C1", 2]
    with pytest.raises(TypeError):
        dt.fread(text="C1,C2\n1,2\n3,4\n", columns=bad_selector)
Ejemplo n.º 25
0
def test_fread_bad_source_file():
    """A class object passed as `file=` raises TypeError."""
    with pytest.raises(TypeError) as e:
        dt.fread(file=TypeError)
    # Match against the raised exception (e.value): str() of the ExceptionInfo
    # wrapper is pytest-version-dependent and is not the plain message.
    assert ("Invalid parameter `file` in fread: expected a str/bytes/PathLike"
            in str(e.value))
Ejemplo n.º 26
0
def test_fread_columns_set1():
    """A set selector keeps only the named columns; an empty field reads as None."""
    csv_lines = ["C1,C2,C3,C4", "1,3.3,7,\"Alice\"", "2,,,\"Bob\""]
    text = "\n".join(csv_lines)
    frame = dt.fread(text=text, columns={"C1", "C3"})
    assert frame.internal.check()
    assert frame.names == ("C1", "C3")
    assert frame.topython() == [[1, 2], [7, None]]
Ejemplo n.º 27
0
def test_fread_bad_source_cmd():
    """A list passed as `cmd=` raises TypeError."""
    with pytest.raises(TypeError) as e:
        dt.fread(cmd=["ls", "-l", ".."])
    # Match against the raised exception (e.value): str() of the ExceptionInfo
    # wrapper is pytest-version-dependent and is not the plain message.
    assert "Invalid parameter `cmd` in fread: expected str" in str(e.value)
Ejemplo n.º 28
0
def test_fread_columns_dict2():
    """A dict selector renames "A" and, via the `...: None` entry, drops the rest."""
    selector = {"A": "a", ...: None}
    frame = dt.fread(text="A,B,C,D\n1,2,3,4", columns=selector)
    assert frame.names == ("a", )
    assert frame.topython() == [[1]]
Ejemplo n.º 29
0
def test_fread_from_text1():
    """The single-token text "A" yields one column named "A" and zero rows."""
    frame = dt.fread(text="A")
    assert frame.internal.check()
    expected_names = ("A", )
    assert frame.names == expected_names
    assert frame.shape == (0, len(expected_names))
Ejemplo n.º 30
0
def test_fread_from_url1():
    """A `url=` value that is not a URL raises ValueError."""
    with pytest.raises(ValueError) as e:
        dt.fread(url="A")
    # Match against the raised exception (e.value): str() of the ExceptionInfo
    # wrapper is pytest-version-dependent and is not the plain message.
    assert "unknown url type" in str(e.value)
Ejemplo n.º 31
0
def test_fread_columns_slice():
    """A step-2 slice selector keeps every other column, starting with the first."""
    every_other = slice(None, None, 2)
    frame = dt.fread(text="A,B,C,D,E\n1,2,3,4,5", columns=every_other)
    assert frame.internal.check()
    assert frame.names == ("A", "C", "E")
    assert frame.topython() == [[1], [3], [5]]
Ejemplo n.º 32
0
# Metadata recorded with every timing passed to write_log().
# (`ver`, `src_grp` and the helper functions used below are not defined in
# this fragment -- presumably they are set earlier in the script.)
git = dt.__git_revision__
task = "read"
data_name = os.path.basename(src_grp)
solution = "pydatatable"
fun = "fread"
cache = "TRUE"

# Input row count = line count reported by `wc -l` minus one (presumably the
# header row).  Split on any run of whitespace: some `wc` implementations
# left-pad the count with spaces, and splitting on a single " " would then
# return an empty first field and make int() fail.
wc_lines = subprocess.run(['wc','-l',data_name], stdout=subprocess.PIPE).stdout.decode('utf-8').split(None, 1)[0]
in_rows = int(wc_lines)-1

print("reading...")

question = "all rows" #1
gc.collect()
t_start = timeit.default_timer()
ans = dt.fread(data_name, show_progress=False)
print(ans.shape)  # printing the shape is inside the timed region
t = timeit.default_timer() - t_start
m = memory_usage()
# The checksum of the answer is timed separately from the read itself.
t_start = timeit.default_timer()
chk = ans[:, sum(f.v3)]
chkt = timeit.default_timer() - t_start
write_log(
    task=task,
    data=data_name,
    in_rows=in_rows,
    question=question,
    out_rows=ans.shape[0],
    out_cols=ans.shape[1],
    solution=solution,
    version=ver,
    git=git,
    fun=fun,
    run=1,
    time_sec=t,
    mem_gb=m,
    cache=cache,
    chk=make_chk(flatten(chk.topython())),
    chk_time_sec=chkt,
)
# Drop the first result before the second read is timed.
del ans
gc.collect()
t_start = timeit.default_timer()
ans = dt.fread(data_name, show_progress=False)
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
Ejemplo n.º 33
0
def test_fread_columns_range():
    """A `range(3)` selector keeps the first three columns of the input."""
    first_three = range(3)
    frame = dt.fread(text="A,B,C,D,E\n1,2,3,4,5", columns=first_three)
    assert frame.internal.check()
    assert frame.names == ("A", "B", "C")
    assert frame.topython() == [[1], [2], [3]]
Ejemplo n.º 34
0
# Paths of the two join inputs come from the environment.
src_x = os.environ['SRC_X_LOCAL']
src_y = os.environ['SRC_Y_LOCAL']

# Metadata recorded with the timing passed to write_log().
ver = dt.__version__
git = dt.__git_revision__
task = "join"
question = "inner join"
# Both inputs are addressed by base name only; the pair also forms the label.
l = [os.path.basename(src) for src in (src_x, src_y)]
data_name = '-'.join(l)
solution = "pydatatable"
fun = "merge"
cache = "TRUE"

print("loading datasets...")

x = dt.fread(l[0])
y = dt.fread(l[1])

print("joining...")

gc.collect()
t_start = timeit.default_timer()
ans = x.merge(y, how='inner', on='KEY')
print(ans.shape)  # printing the shape is inside the timed region
t = timeit.default_timer() - t_start
m = memory_usage()
# The checksum of the answer is timed separately from the join itself.
t_start = timeit.default_timer()
chk = ans[:, [sum(f.X2), sum(f.Y2)]]
chkt = timeit.default_timer() - t_start
write_log(
    task=task,
    data=data_name,
    in_rows=x.shape[0],
    question=question,
    out_rows=ans.shape[0],
    out_cols=ans.shape[1],
    solution=solution,
    version=ver,
    git=git,
    fun=fun,
    run=1,
    time_sec=t,
    mem_gb=m,
    cache=cache,
    chk=make_chk(flatten(chk.topython())),
    chk_time_sec=chkt,
)
del ans
Ejemplo n.º 35
0
# Pull the shared benchmark helpers into this module's namespace.  The context
# manager closes helpers.py right after its source is read instead of leaving
# the handle open for the rest of the run.
# (write_log, memory_usage, make_chk and flatten used below are not defined in
# this script -- presumably they come from helpers.py.)
with open("./helpers.py") as _helpers_fh:
    exec(_helpers_fh.read())

# Path of the input dataset comes from the environment.
src_x = os.environ['SRC_X_LOCAL']

# Metadata recorded with the timing passed to write_log().
ver = dt.__version__
git = dt.__git_revision__
task = "sort"
question = "by int KEY"
data_name = os.path.basename(src_x)
solution = "pydatatable"
fun = ".sort"
cache = "TRUE"

print("loading dataset...")

x = dt.fread(data_name)

print("sorting...")

gc.collect()
t_start = timeit.default_timer()
ans = x.sort('KEY')
print(ans.shape)  # printing the shape is inside the timed region
t = timeit.default_timer() - t_start
m = memory_usage()
# The checksum of the answer is timed separately from the sort itself.
t_start = timeit.default_timer()
chk = ans[:, sum(f.X2)]
chkt = timeit.default_timer() - t_start
write_log(
    task=task,
    data=data_name,
    in_rows=x.shape[0],
    question=question,
    out_rows=ans.shape[0],
    out_cols=ans.shape[1],
    solution=solution,
    version=ver,
    git=git,
    fun=fun,
    run=1,
    time_sec=t,
    mem_gb=m,
    cache=cache,
    chk=make_chk(flatten(chk.topython())),
    chk_time_sec=chkt,
)
del ans