def _test_as_array_perf():
    """Rough benchmark of ``as_array`` with and without ``type_safe``.

    Builds a 5000-row frame with 300 columns (100 ``a*:int``, 100 ``b*:int``,
    100 ``c*:str``), runs each variant once up front, then times 10 rounds of
    each and prints the two accumulated durations in seconds
    (non-type-safe first, type-safe second).

    The leading underscore presumably keeps pytest from collecting this
    function, so it only runs when invoked by hand.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level imports.
    from time import perf_counter

    s = Schema()
    arr = []
    # The "b" columns are declared as int but filled with floats, so the
    # type-safe path has real conversion work to do.
    for prefix, type_name, convert in (
        ("a", "int", int),
        ("b", "int", float),
        ("c", "str", str),
    ):
        for i in range(100):
            s.append(f"{prefix}{i}:{type_name}")
            arr.append(convert(i))

    # Every row is an independent copy of the template row.
    data = [list(arr) for _ in range(5000)]
    df = DaskDataFrame(data, s)

    # Run both variants once before timing, as the original did.
    df.as_array()
    df.as_array(type_safe=True)

    nts, ts = 0.0, 0.0
    for _ in range(10):
        # perf_counter is monotonic and high-resolution, unlike wall-clock
        # datetime.now(), which can jump if the system clock is adjusted.
        start = perf_counter()
        df.as_array()
        nts += perf_counter() - start

        start = perf_counter()
        df.as_array(type_safe=True)
        ts += perf_counter() - start
    print(nts, ts)
def _test_nested():
    """Check type-safe ``as_array`` on a list-of-struct column.

    The single cell holds a list with one JSON-encoded object; the assertion
    expects it to come back as a dict with the missing ``a`` field set to
    ``None`` and the ``b`` items as ints.

    The leading underscore presumably keeps pytest from collecting this
    function.
    """
    # TODO: nested type doesn't work in dask
    # data = [[dict(a=1, b=[3, 4], d=1.0)], [json.dumps(dict(b=[30, "40"]))]]
    # df = DaskDataFrame(data, "a:{a:str,b:[int]}")
    # a = df.as_array(type_safe=True)
    # assert [[dict(a="1", b=[3, 4])], [dict(a=None, b=[30, 40])]] == a

    encoded_item = json.dumps({"b": [30, "40"]})
    rows = [[[encoded_item]]]
    frame = DaskDataFrame(rows, "a:[{a:str,b:[int]}]")

    actual = frame.as_array(type_safe=True)
    expected = [[[{"a": None, "b": [30, 40]}]]]
    assert expected == actual
def test_as_array():
    """Exercise ``as_array`` / ``as_array_iterable`` on ``DaskDataFrame``.

    Covers an empty frame, column selection and reordering, and the Python
    types of returned cells for double, int and datetime columns (including
    ``np.float64`` input and ``NaT``).
    """

    def _check_first_row_types(frame, first_type):
        # Column "a" must come back as ``first_type``; column "b" as int.
        assert isinstance(frame.as_array()[0][0], first_type)
        assert isinstance(frame.as_array()[0][1], int)

    # Empty frame: every accessor yields an empty result.
    frame = DaskDataFrame([], "a:str,b:int")
    assert [] == frame.as_array()
    assert [] == frame.as_array(type_safe=True)
    assert [] == list(frame.as_array_iterable())
    assert [] == list(frame.as_array_iterable(type_safe=True))

    # Column selection and reordering.
    frame = DaskDataFrame([["a", 1]], "a:str,b:int")
    assert [["a", 1]] == frame.as_array()
    assert [["a", 1]] == frame.as_array(["a", "b"])
    assert [[1, "a"]] == frame.as_array(["b", "a"])

    # prevent pandas auto type casting
    frame = DaskDataFrame([[1.0, 1.1]], "a:double,b:int")
    assert [[1.0, 1]] == frame.as_array()
    _check_first_row_types(frame, float)
    assert [[1.0, 1]] == frame.as_array(["a", "b"])
    assert [[1, 1.0]] == frame.as_array(["b", "a"])

    # numpy scalar as input
    frame = DaskDataFrame([[np.float64(1.0), 1.1]], "a:double,b:int")
    assert [[1.0, 1]] == frame.as_array()
    _check_first_row_types(frame, float)

    # Timestamp input
    frame = DaskDataFrame(
        [[pandas.Timestamp("2020-01-01"), 1.1]], "a:datetime,b:int"
    )
    frame.native["a"] = pd.to_datetime(frame.native["a"])
    assert [[datetime(2020, 1, 1), 1]] == frame.as_array()
    _check_first_row_types(frame, datetime)

    # NaT input: only the returned types are checked, not the values.
    frame = DaskDataFrame([[pandas.NaT, 1.1]], "a:datetime,b:int")
    frame.native["a"] = pd.to_datetime(frame.native["a"])
    _check_first_row_types(frame, datetime)

    # type_safe=True on the double/int frame.
    frame = DaskDataFrame([[1.0, 1.1]], "a:double,b:int")
    assert [[1.0, 1]] == frame.as_array(type_safe=True)
    _check_first_row_types(frame, float)