def test_corr_multiple():
    """Check corr() when one or both arguments expand to several columns."""
    # Reference coefficients; they are reused below with either sign.
    r_ab = -0.07168504827326534
    r_ac = 0.07559289460184544
    r_bd_abs = 0.7207110797203374

    src = dt.Frame(A=[3, 5, 9, 1],
                   B=[4, 7, 0, 0],
                   C=[3, 2, 1, 0],
                   D=range(4))

    a_vs_all = src[:, corr(f.A, f[:])]
    all_vs_d = src[:, corr(f[:], f.D)]
    all_vs_all = src[:, corr(f[:], f[:])]

    assert_equals(a_vs_all, dt.Frame([[1.0], [r_ab], [r_ac], [-r_ac]]))
    assert_equals(all_vs_d, dt.Frame([[-r_ac], [-r_bd_abs], [-1.0], [1.0]]))
    assert_equals(all_vs_all, dt.Frame([[1.0], [1.0], [1.0], [1.0]]))
def test_corr_random(numpy, seed):
    """Compare corr() with numpy.corrcoef on two random 100-element arrays."""
    numpy.random.seed(seed)
    xs = numpy.random.rand(100)
    ys = numpy.random.rand(100)

    # Off-diagonal entry of the 2x2 matrix returned by corrcoef.
    expected = numpy.corrcoef(xs, ys)[0, 1]

    frame = dt.Frame([xs, ys])
    result = frame[:, corr(f[0], f[1])]
    actual = result[0, 0]

    assert numpy.isclose(expected, actual, atol=1e-12, rtol=1e-12)
def test_corr_with_constant():
    """corr() against a constant column is expected to equal a NaN frame."""
    # NOTE(review): a function with this same name is defined again later in
    # this file; if both live in one module, the later one shadows this test.
    n_rows = 23
    frame = dt.Frame(A=range(n_rows), B=[2.5] * n_rows)
    result = frame[:, corr(f.A, f.B)]
    expected = dt.Frame([math.nan])
    assert_equals(result, expected)
def test_corr_small_frame():
    """corr() on a one-row frame and on an empty frame gives a float64 None."""
    one_row = dt.Frame(A=[1], B=[2])
    no_rows = dt.Frame(A=[], B=[])

    res_one_row = one_row[:, corr(f.A, f.B)]
    res_no_rows = no_rows[:, corr(f.A, f.B)]

    assert_equals(res_one_row, dt.Frame([None], stype=dt.float64))
    assert_equals(res_no_rows, dt.Frame([None], stype=dt.float64))
def test_corr_simple2():
    """An increasing column against a decreasing one must give exactly -1.0."""
    frame = dt.Frame(A=range(5), B=range(5, 0, -1))
    result = frame[:, corr(f.A, f.B)]
    expected = dt.Frame([-1.0])
    assert_equals(result, expected)
fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans question = 'regression v1 v2 by id2 id4' # q9 gc.collect() t_start = timeit.default_timer() ans = x[:, {'r2': corr(f.v1, f.v2)**2}, by(f.id2, f.id4)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = ans[:, sum(f.r2)] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git,
def test_corr_with_constant():
    """corr() against a constant column is expected to be a float None."""
    n_rows = 23
    frame = dt.Frame(A=range(n_rows), B=[2.5] * n_rows)
    result = frame[:, corr(f.A, f.B)]
    expected = dt.Frame([None], type=float)
    assert_equals(result, expected)
# Query: first two rows of v3 per id6 group after sorting by descending v3.
# `t_start` and `question` for this query are set before this excerpt.
ans = x[:2, {"largest2_v3": f.v3}, by(f.id6), sort(-f.v3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
# The checksum aggregation is timed separately from the query itself.
t_start = timeit.default_timer()
chk = ans[:, sum(f.largest2_v3)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
print(ans.head(3).to_pandas(), flush=True)
print(ans.tail(3).to_pandas(), flush=True)
del ans

question = "regression v1 v2 by id2 id4" # q9
# not yet implemented https://github.com/h2oai/datatable/issues/1543
# NOTE(review): corr() is called below, so the "not yet implemented" remark
# above may be stale -- confirm against the linked issue.
gc.collect()
t_start = timeit.default_timer()
# Squared correlation of v1 and v2 within each (id2, id4) group.
ans = x[:, {"r2": corr(f.v1, f.v2)**2}, by(f.id2, f.id4)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
# The checksum aggregation is timed separately from the query itself.
t_start = timeit.default_timer()
chk = ans[:, sum(f.r2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
del ans
gc.collect()
# Same query executed again; its logging lies beyond the end of this excerpt.
t_start = timeit.default_timer()
ans = x[:, {"r2": corr(f.v1, f.v2)**2}, by(f.id2, f.id4)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()