def test():
    """Check that series and dataframe columns interoperate with NumPy.

    Three things are verified against plain NumPy arrays:

    * ``np.asarray`` applied to a series yields the series' values;
    * ``np.add`` accepts a series together with an array;
    * ``np.add`` accepts two columns taken from the same dataframe.
    """
    series = pd.Series([1, 2, 3])
    converted = np.asarray(series)
    x = np.array([1, 2, 3])
    assert np.array_equal(converted, x)

    # Series + array through the NumPy ufunc.
    y = np.array([4, 5, 6])
    series_plus_array = np.add(series, y)
    assert np.array_equal(series_plus_array, x + y)

    # Column + column through the NumPy ufunc.
    frame = pd.DataFrame({"x": x, "y": y})
    column_sum = np.add(frame["x"], frame["y"])
    assert np.array_equal(column_sum, x + y)
"c1": c1, "c3": c3_l, "key1": np.array(key_left, dtype=key_dtype1), "key2": np.array(key_left, dtype=key_dtype2), } ) df2 = pd.DataFrame( { "c2": c2, "c3": c3_r, "key1": np.array(key_right, dtype=key_dtype1), "key2": np.array(key_right, dtype=key_dtype2), } ) ldf1 = lp.DataFrame(df1) ldf2 = lp.DataFrame(df2) join_pandas = df1.merge(df2, on=keys) join_legate = ldf1.merge(ldf2, on=keys, method="broadcast") join_legate_hash = ldf1.merge(ldf2, on=keys, method="hash") assert sort_and_compare(join_pandas, to_pandas(join_legate)) assert sort_and_compare(join_pandas, to_pandas(join_legate_hash)) key_left = list(chain(*[[x] * 3 for x in range(n // 3, 0, -1)])) for pair in product(key_dtypes, key_dtypes[1:] + key_dtypes[:1]): key_dtype1, key_dtype2 = pair print( "Type: left, Size: %u, Key dtype1: %s, Key dtype2: %s " % (n, str(key_dtype1), str(key_dtype2))
# Copyright 2021 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pandas as pd

from legate import pandas as lp

# Start from the same two-column frame on the pandas and the Legate side.
pandas_frame = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
legate_frame = lp.DataFrame(pandas_frame)

# Copy an existing column into a new column on both frames.
pandas_frame["col3"] = pandas_frame["col1"]
legate_frame["col3"] = legate_frame["col1"]

# The Legate frame must match the updated pandas frame after conversion.
assert legate_frame.equals(lp.DataFrame(pandas_frame))
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pandas as pd

from legate import pandas as lp

# Ten rows covering integer, string and categorical columns.
pandas_frame = pd.DataFrame(
    {
        "a": range(10),
        "b": range(1, 11),
        "c": [str(value) * 3 for value in range(10)],
        "d": [str(value % 3) for value in range(10)],
    }
)
pandas_frame["c"] = pandas_frame["c"].astype(pd.StringDtype())
pandas_frame["d"] = pandas_frame["d"].astype("category")
legate_frame = lp.DataFrame(pandas_frame)

# Compare a short tail and a tail covering all but one row.
for count in (2, 9):
    assert legate_frame.tail(count).equals(pandas_frame.tail(count))
# See the License for the specific language governing permissions and # limitations under the License. # import pandas as pd from legate import pandas as lp from tests.utils import equals, must_fail def _test(ex, df, *args): must_fail(ex, df.insert, *args) df = pd.DataFrame() ldf = lp.DataFrame() df.insert(0, "a", 1) ldf.insert(0, "a", 1) assert equals(ldf, df) df = pd.DataFrame(index=[1, 2, 3]) ldf = lp.DataFrame(index=[1, 2, 3]) df.insert(0, "a", 1) ldf.insert(0, "a", 1) assert equals(ldf, df) df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Prefer the Legate random generator and fall back to NumPy's when the
# Legate NumPy module is not installed.
try:
    from legate.numpy.random import random
except ModuleNotFoundError:
    from numpy.random import random

import pandas as pd

from legate import pandas as lp

x = random(10)
y = random(10)

pandas_frame = pd.DataFrame({"x": x, "y": y})

# Legate frame built directly from the dictionary of arrays.
from_arrays = lp.DataFrame({"x": x, "y": y})
# FIXME: We don't handle this case correctly now. DataFrame's ctor
#        should align all series in the dictionary.
# from_series = lp.DataFrame({"x": lp.Series(x), "y": lp.Series(y)})
# Legate frame built from another Legate frame.
from_legate = lp.DataFrame(from_arrays)

# Every construction path must agree with the frame converted from pandas.
# (Add ``from_series`` to this tuple once the FIXME above is resolved.)
for candidate in (from_arrays, from_legate):
    assert candidate.equals(lp.DataFrame(pandas_frame))
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pandas as pd

from legate import pandas as lp

n = 17

# Three-character strings, with every fourth entry replaced by a null.
a = [None if pos % 4 == 0 else str(pos) * 3 for pos in range(n)]

pandas_series = pd.Series(a).astype(pd.StringDtype())
legate_data = lp.DataFrame(pandas_series)

# Dropping the nulls must give the same result on both sides.
expected = pandas_series.dropna()
actual = legate_data.dropna()
assert actual.equals(expected)
# limitations under the License. # import pandas as pd from numpy.random import permutation, randn from legate import pandas as lp from tests.utils import equals n = 17 indices = [pd.RangeIndex(1, n + 1), pd.Index(permutation(n))] for index in indices: print(f"Index: {index}") df1 = pd.DataFrame({1: randn(n), 2: randn(n), 5: randn(n)}, index=index) ldf1 = lp.DataFrame(df1) df2 = pd.DataFrame({1: randn(n), 2: randn(n), 5: randn(n)}, index=index) out_pd = df1 + df2 out_lp = ldf1 + df2 assert equals(out_lp, out_pd) out_pd = df1 + df2.values out_lp = ldf1 + df2.values assert equals(out_lp, out_pd) out_pd = df1.add(df2[1].values, axis=0) out_lp = ldf1.add(df2[1].values, axis=0) assert equals(out_lp, out_pd) out_pd = df1.add(df2[1].to_list(), axis=0)
{"c1": c1, "key1": np.array(key_left, dtype=key_dtype), "c3": c3_l} ) df1["key"] = df1["key1"] df1_key_on_index = df1.set_index("key") df2 = pd.DataFrame( { "c2": c2, "key2": np.array(key_right, dtype=key_dtype), "c3": c3_r, } ) df2["key"] = df2["key2"] df2_key_on_index = df2.set_index("key") ldf1 = lp.DataFrame(df1) ldf1_key_on_index = lp.DataFrame(df1_key_on_index) ldf2 = lp.DataFrame(df2) ldf2_key_on_index = lp.DataFrame(df2_key_on_index) join_pandas2 = df1.merge( df2_key_on_index, left_on="key1", right_index=True ) join_pandas4 = df1_key_on_index.merge( df2, right_on="key2", left_index=True ) # XXX: Pandas sorts the keys in the output when both left_index and # right_index are True, whereas Legate does not, for performance # reasons. In this test we sorted the keys in the input dataframe # so that Pandas' join output coincides with Legate's. We can't # and won't guarantee this semantic equivalence in general.
import pandas as pd

from legate import pandas as lp
from tests.utils import equals, must_fail


def _test(ex, df, *args):
    """Hand ``must_fail`` a callable that evaluates ``df.loc[args]``.

    ``args`` is forwarded to ``.loc`` as a single tuple key.
    """
    must_fail(ex, lambda: df.loc[args])


n = 17

# The same labels (3 .. n + 2) once as a range index and once as a
# materialized integer index.
for index in [pd.RangeIndex(3, n + 3), pd.Index(list(range(3, n + 3)))]:
    untouched = lp.DataFrame({"a": range(n)}, index=index)
    frame = lp.DataFrame({"a": range(n)}, index=index)

    # The first label past the end of the index.
    missing = n + 3

    # Reading labels that are not in the index goes through ``must_fail``.
    _test(KeyError, frame, missing)
    _test(KeyError, frame, missing + 1, "a")

    # Slicing entirely past the end selects nothing.
    assert len(frame.loc[missing:missing + 1]) == 0

    # Writes to missing labels must leave the frame unchanged.
    frame.loc[missing] = 100
    assert equals(untouched, frame)

    frame.loc[missing:missing + 1] = 200
    assert equals(untouched, frame)
pd.RangeIndex(21, 1, -2), pd.Index(permutation(10)), ]: print(f"Index: {index}") df = pd.DataFrame( { "a": range(10), "b": range(1, 11), "c": [str(i) * 3 for i in range(10)], "d": [str(i % 3) for i in range(10)], }, index=index, ) df["c"] = df["c"].astype(pd.StringDtype()) df["d"] = df["d"].astype("category") ldf = lp.DataFrame(df) for idx in range(4): print(f"Testing ldf.iat[{index[idx + 3]}, {idx}].__getitem__") out_pd = df.iat[idx + 3, idx] out_lp = ldf.iat[idx + 3, idx] assert equals_scalar(out_lp, out_pd) for idx, val in enumerate([100, 200, "5678"]): print(f"Testing ldf.iat[{index[idx + 3]}, {idx}].__setitem__") df.iat[idx + 3, idx] = val ldf.iat[idx + 3, idx] = val out_pd = df.iat[idx + 3, idx] out_lp = ldf.iat[idx + 3, idx] assert equals_scalar(out_lp, out_pd)
import pandas as pd from legate import pandas as lp from tests.utils import equals indices = [ pd.RangeIndex(3), pd.RangeIndex(1, 4), pd.RangeIndex(6, step=2), pd.RangeIndex(1, 10, step=3), ] for index in indices: df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=index) ldf = lp.DataFrame(df) # Passing Legate dataframes as arguments assert equals(lp.DataFrame(ldf, dtype="float64"), pd.DataFrame(df, dtype="float64")) assert equals(lp.DataFrame(ldf, columns=["a"]), pd.DataFrame(df, columns=["a"])) assert equals( lp.DataFrame(ldf, columns=["a"], dtype="float64"), pd.DataFrame(df, columns=["a"], dtype="float64"), ) # Passing Legate series as arguments assert equals(lp.DataFrame(ldf["a"]), pd.DataFrame(df["a"])) assert equals( lp.DataFrame(ldf["a"], dtype="float32"),
# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import pandas as pd from legate import pandas as lp from tests.utils import equals df = pd.DataFrame() ldf = lp.DataFrame() assert equals(ldf, df) df["a"] = 0 ldf["a"] = 0 assert equals(ldf, df) df = pd.DataFrame(columns=["a", "b"]) ldf = lp.DataFrame(columns=["a", "b"]) assert equals(ldf, df) df.loc[:, "a"] = "1" ldf.loc[:, "a"] = "1"
def test( size_per_proc=1000, num_procs=1, num_runs=1, scale_lhs_only=False, package="legate", ty="int64", key_length=40, pad_side="right", ): if package == "legate": from legate import numpy as np, pandas as pd from legate.numpy.random import randn elif package == "cudf": import cudf as pd import cupy as np from cupy.random import randn elif package == "pandas": import numpy as np import pandas as pd from numpy.random import randn else: print("Unknown dataframe package: %s" % package) assert False if package == "legate": from legate.timing import time def block(): pass def get_timestamp(): return time() def compute_elapsed_time(start_ts, stop_ts): return (stop_ts - start_ts) / 1000.0 else: import time def block(): pass get_timestamp = time.process_time def compute_elapsed_time(start_ts, stop_ts): return (stop_ts - start_ts) * 1000.0 size = size_per_proc * num_procs key = np.arange(size, dtype=np.int64) % size_per_proc payload = randn(size) df = pd.DataFrame({"key": key, "payload": payload}) if ty == "int64": df["key"] = df["key"] * -1 ascending = True if ty == "string": df["key"] = ( df["key"] .astype(str) .str.pad(width=key_length, side=pad_side, fillchar="0") ) ascending = False print("Size: %u, Key dtype: %s" % (size, df["key"].dtype)) block() for i in range(num_runs): start_ts = get_timestamp() result = df.sort_values("key", ignore_index=True, ascending=ascending) stop_ts = get_timestamp() print( "[Run %d] Elapsed time: %lf ms" % (i + 1, compute_elapsed_time(start_ts, stop_ts)) ) del result
"key1": np.array(key1, dtype=np.int64), "key2": np.array(key1[::-1], dtype=np.int64), }) df2 = pd.DataFrame({ "c2": c2, "key1": np.array(key2, dtype=np.int64), "key2": np.array(key2[::-1], dtype=np.int64), }) df3 = pd.DataFrame({"c3": c3, "key1": np.array(key3, dtype=np.int64)}) df4 = pd.DataFrame({ "c4": c3, "key1": np.array(key3, dtype=np.int64), "key2": np.array(key3[::-1], dtype=np.int64), }) ldf1 = lp.DataFrame(df1) ldf2 = lp.DataFrame(df2) ldf3 = lp.DataFrame(df3) ldf4 = lp.DataFrame(df4) join_pandas = (df1.merge(df2, on=["key1", "key2"]).merge(df3, on="key1").merge(df4, on="key1")) join_legate = (ldf1.merge(ldf2, on=["key1", "key2"]).merge(ldf3, on="key1").merge(ldf4, on="key1"))
def test( size_per_proc=1000, num_procs=1, num_runs=1, ty="int64", key_length=10, scale_lhs_only=False, package="legate", ): if package == "legate": from legate import numpy as np, pandas as pd from legate.numpy.random import randn elif package == "cudf": import cudf as pd import cupy as np from cupy.random import randn elif package == "pandas": import numpy as np import pandas as pd from numpy.random import randn elif package == "dask" or package == "daskcudf": import dask.array as da import dask.dataframe as df import numpy as np if package == "daskcudf": import cudf else: print("Unknown dataframe package: %s" % package) assert False if package == "legate": from legate.timing import time def block(*args): pass def get_timestamp(): return time() def compute_elapsed_time(start_ts, stop_ts): return (stop_ts - start_ts) / 1000.0 elif package == "dask" or package == "daskcudf": import time def block(*args): for arg in args: arg.compute() get_timestamp = time.process_time def compute_elapsed_time(start_ts, stop_ts): return (stop_ts - start_ts) * 1000.0 else: import time def block(*args): pass get_timestamp = time.process_time def compute_elapsed_time(start_ts, stop_ts): return (stop_ts - start_ts) * 1000.0 if scale_lhs_only: size = size_per_proc * num_procs size_rhs = size // 3 if package == "dask" or package == "daskcudf": # Dask array does not have randn so use arrange c1 = da.arange(size, dtype=np.float64, chunks=size_per_proc) c2 = da.arange( size_rhs, dtype=np.float64, chunks=(size_per_proc + num_procs - 1) // num_procs, ) else: c1 = randn(size) c2 = randn(size_rhs) key_dtype = np.int64 if package == "dask" or package == "daskcudf": key_left = ( da.arange(size, dtype=key_dtype, chunks=size_per_proc) % size_per_proc ) key_right = da.arange( size_rhs, dtype=key_dtype, chunks=(size_per_proc + num_procs - 1) // num_procs, ) da.multiply(key_right, 3, out=key_right) else: key_left = np.arange(size, dtype=key_dtype) % size_per_proc key_right = np.arange(size_rhs, 
dtype=key_dtype) np.multiply(key_right, 3, out=key_right) else: size = size_per_proc * num_procs size_rhs = size if package == "dask" or package == "daskcudf": # Dask array does not have randn so use arrange c1 = da.arange(size, dtype=np.float64, chunks=size_per_proc) c2 = da.arange(size, dtype=np.float64, chunks=size_per_proc) else: c1 = randn(size) c2 = randn(size) key_dtype = np.int64 if package == "dask" or package == "daskcudf": key_left = da.arange(size, dtype=key_dtype, chunks=size_per_proc) key_right = da.arange(size, dtype=key_dtype, chunks=size_per_proc) else: key_left = np.arange(size, dtype=key_dtype) key_right = np.arange(size, dtype=key_dtype) # np.floor_divide(key_right, 3, out=key_right) # np.multiply(key_right, 3, out=key_right) if package == "dask" or package == "daskcudf": df1 = df.multi.concat( [df.from_dask_array(a) for a in [c1, key_left]], axis=1 ) df1.columns = ["c1", "key"] df2 = df.multi.concat( [df.from_dask_array(a) for a in [c2, key_right]], axis=1 ) df2.columns = ["c2", "key"] if package == "daskcudf": df1 = df1.map_partitions(cudf.from_pandas) df2 = df2.map_partitions(cudf.from_pandas) else: df1 = pd.DataFrame({"c1": c1, "key": key_left}) df2 = pd.DataFrame({"c2": c2, "key": key_right}) df2["key"] = df2["key"] // 3 * 3 if ty == "string": df1["key"] = ( df1["key"] .astype("string") .str.pad(width=key_length, side="both", fillchar="0") ) df2["key"] = ( df2["key"] .astype("string") .str.pad(width=key_length, side="both", fillchar="0") ) print( "Type: inner, Size: %u x %u, Key dtype: %s" % (size, size_rhs, str(key_dtype)) ) block(df1, df2) for i in range(num_runs): start_ts = get_timestamp() df_result = df1.merge(df2, on="key") block(df_result) stop_ts = get_timestamp() print( "[Run %d] Elapsed time: %lf ms" % (i + 1, compute_elapsed_time(start_ts, stop_ts)) ) del df_result