def math_op_base():
    """Time ``key + 1`` on the same integer column in NumPy, PyCylon, pandas and PyArrow.

    Builds a 10,000,000-row frame with 100 float columns plus an integer
    ``key`` column, then prints the wall-clock time each library needs to add
    1 to the key column.

    All inputs, including the Arrow array handed to ``pc.add``, are prepared
    before the first timestamp so that every timed window contains only the
    arithmetic operation itself.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    num_rows = 10_000_000
    data = np.random.randn(num_rows)

    df = pd.DataFrame({'data{}'.format(i): data for i in range(100)})
    np_key = np.random.randint(0, 100, size=num_rows)
    df['key'] = np_key

    ct = Table.from_pandas(ctx, df)

    # Extract the Arrow key array up front: converting the Cylon table to
    # Arrow is setup work and must not be charged to the PyArrow measurement.
    ar_key = ct['key'].to_arrow().combine_chunks().columns[0].chunks[0]

    # perf_counter is the monotonic, high-resolution clock intended for
    # measuring short durations.
    t1 = time.perf_counter()
    np_key + 1
    t2 = time.perf_counter()
    ct['key'] + 1
    t3 = time.perf_counter()
    df['key'] + 1
    t4 = time.perf_counter()
    pc.add(ar_key, 1)
    t5 = time.perf_counter()

    print(f"Numpy Time: {t2 - t1} s")
    print(f"PyCylon Time: {t3 - t2} s")
    print(f"Pandas Time: {t4 - t3} s")
    print(f"PyArrow Time: {t5 - t4} s")
def test_input_type_conversion():
    """Check that compute kernels accept plain Python values as inputs."""
    # Two Python lists are both converted to arrays
    list_plus_list = pc.add([1, 2], [4, None]).to_pylist()
    assert list_plus_list == [5, None]

    # A Python list combined with a Python int scalar
    list_plus_scalar = pc.add([1, 2], 4).to_pylist()
    assert list_plus_scalar == [5, 6]

    # A scalar of another type (string) is converted as well
    matches_foo = pc.equal(["foo", "bar", None], "foo").to_pylist()
    assert matches_foo == [True, False, None]
def clean_cat(arr, categories=None):
    """Encode a column as integer category codes, reserving 0 for 'Unknown'.

    The input is cast to string and dictionary-encoded.

    Args:
        arr: Array supporting ``cast(...).dictionary_encode()`` whose result
            exposes ``.dictionary`` and ``.indices``.
        categories: Optional sequence of category labels. When given and
            non-empty, a value equal to ``categories[k]`` is coded ``k + 1``
            (the first occurrence wins if a label is repeated); values not
            listed, and nulls, are coded 0. When omitted or empty, the
            labels are taken from the data's own dictionary.

    Returns:
        A ``(codes, labels)`` tuple where ``labels[0]`` is ``'Unknown'`` and
        ``labels[code]`` names the category for each non-zero code.
    """
    encoded = arr.cast(pa.string()).dictionary_encode()
    dictionary = encoded.dictionary.to_pylist()

    if categories:
        categories = list(categories)

        # Label -> 1-based code, built once instead of calling
        # categories.index() per dictionary entry. setdefault keeps the first
        # occurrence, matching list.index. Dictionary values are strings, so
        # non-string entries can never match and are skipped.
        code_of = {}
        for code, label in enumerate(categories, start=1):
            if isinstance(label, str):
                code_of.setdefault(label, code)

        # lookup[i] is the code for dictionary entry i. The extra trailing
        # slot stays 0 and is what index -1 (used for nulls) resolves to.
        lookup = np.zeros(len(dictionary) + 1, dtype=np.int_)
        for position, value in enumerate(dictionary):
            lookup[position] = code_of.get(value, 0)

        # Plain fancy indexing also works on zero-length input, unlike
        # np.vectorize without otypes.
        indices = encoded.indices.fill_null(-1).to_numpy()
        return (pa.array(lookup[indices]), ['Unknown'] + categories)

    # Shift the dictionary indices up by one so that 0 is free for nulls.
    one = pa.array([1], type=pa.int32())[0]
    return (c.add(encoded.indices, one).fill_null(0),
            ['Unknown'] + dictionary)
def test_arithmetic_add():
    """Check that pc.add sums two integer arrays element by element."""
    lhs = pa.array([1, 2, 3, 4, 5])
    rhs = pa.array([0, -1, 1, 2, 3])
    want = pa.array([1, 1, 4, 6, 8])

    got = pc.add(lhs, rhs)

    assert got.equals(want)
def my_add(arr1, arr2, **kwargs):
    """Return ``pc.add(arr1, arr2)``.

    Extra keyword arguments are accepted but not forwarded to ``pc.add``.
    """
    total = pc.add(arr1, arr2)
    return total