def test_multicol():
    """Verify a two-key Cylon sort against ``DataFrame.sort_values``.

    Two random integer columns are sorted by ``col1`` ascending and ``col2``
    descending in both libraries; the resulting rows must be identical.
    """
    sort_columns = ['col1', 'col2']
    sort_directions = [True, False]

    # Cylon side: numpy -> arrow -> cylon table, then sort.
    ctx: CylonContext = CylonContext()
    c1 = np.random.randint(10, size=100)
    c2 = np.random.randint(10, size=100)
    arrow_table: pa.Table = pa.Table.from_arrays(
        [pa.array(c1), pa.array(c2)], names=sort_columns)
    cylon_sorted = cn.Table.from_arrow(ctx, arrow_table).sort(
        order_by=sort_columns, ascending=sort_directions)

    # pandas side: same data, same sort keys and directions.
    frame = pd.DataFrame({'col1': c1, 'col2': c2}, columns=sort_columns)
    pandas_sorted = frame.sort_values(by=sort_columns,
                                      ascending=sort_directions)

    assert cylon_sorted.to_pandas().values.tolist() == \
        pandas_sorted.values.tolist()
def test_arrow_cylon():
    """Check that converting an Arrow table to a Cylon table keeps its shape.

    Reads ``/tmp/user_device_tm_1.csv`` with the Arrow CSV reader, wraps the
    result in a Cylon ``Table`` and asserts that row and column counts agree.
    The context is finalized even when an assertion fails.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    try:
        table_path = '/tmp/user_device_tm_1.csv'
        assert os.path.exists(table_path)

        tb: pa.Table = csv.read_csv(table_path)
        arrow_columns = len(tb.columns)
        arrow_rows = tb.num_rows

        tbc = Table.from_arrow(ctx, tb)
        cylon_rows = tbc.row_count
        cylon_columns = tbc.column_count

        assert arrow_columns == cylon_columns
        assert arrow_rows == cylon_rows
    finally:
        # Release the context regardless of the test outcome.
        ctx.finalize()
def test_setitem_with_index():
    """Check that adding a column to an indexed table leaves the index intact.

    The same CSV data is indexed by column ``'a'`` in Cylon and in pandas, a
    new column ``'e'`` is assigned on both, and the index values are compared.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    pdf: pd.DataFrame = tb.to_pandas()

    print(tb)
    print("-" * 80)
    print(pdf)

    # Index both representations by the same column.
    tb.set_index('a', drop=True)
    pdf.set_index('a', drop=True, inplace=True)

    # 0, 10, 20, ... one value per row.
    new_data = list(range(0, tb.row_count * 10, 10))
    tb['e'] = Table.from_list(ctx, ['new_col'], [new_data])
    pdf['e'] = pd.DataFrame(new_data)

    cylon_index = tb.index.values
    pandas_index = pdf.index.values
    print(cylon_index)
    print(pandas_index)
    assert cylon_index.tolist() == pandas_index.tolist()
def test_dropna():
    """Compare Cylon ``Table.dropna`` with ``pandas.DataFrame.dropna``.

    Every combination of ``inplace``, ``how``, ``axis`` and input dataset is
    exercised. The test passes ``axis`` to Cylon and ``1 - axis`` to pandas,
    i.e. it treats the two libraries' axis numbering as inverted (the
    original note reads "axis=0 => column-wise" for Cylon).
    """
    from itertools import product

    columns = ['col1', 'col2', 'col3']
    datum_1 = [[1.0, 2.0, 3.0, 4.0, 5.0, None],
               [None, 7.0, 8.0, 9.0, 10.0, 11.0],
               [12.0, 13.0, 14.0, 15.0, 16.0, 17.0]]
    datum_2 = [[1.0, 2.0, 3.0, 4.0, 5.0, None],
               [None, 7.0, 8.0, 9.0, 10.0, None],
               [12.0, 13.0, None, 15.0, 16.0, 17.0]]
    dataset = [datum_1, datum_2]
    ctx: CylonContext = CylonContext(config=None, distributed=False)

    inplace_ops = [True, False]
    hows = ['any', 'all']
    axiz = [0, 1]

    # product() iterates with the last argument varying fastest, which is the
    # same order as the equivalent four nested loops.
    for inplace, how, axis, data in product(inplace_ops, hows, axiz, dataset):
        cn_tb = cn.Table.from_list(ctx, columns, data)
        df = cn_tb.to_pandas()
        pandas_axis = 1 - axis
        if inplace:
            cn_tb.dropna(axis=axis, how=how, inplace=inplace)
            df.dropna(axis=pandas_axis, how=how, inplace=inplace)
        else:
            cn_tb = cn_tb.dropna(axis=axis, how=how, inplace=inplace)
            df = df.dropna(axis=pandas_axis, how=how, inplace=inplace)

        # Remaining nulls are normalised to 0 so the lists compare equal.
        pdf_values = df.fillna(0).values.flatten().tolist()
        cn_tb_values = cn_tb.to_pandas().fillna(0).values.flatten().tolist()
        assert pdf_values == cn_tb_values
def isin_op(num_rows: int, num_cols: int, filter_size: int, unique_factor: float):
    """Benchmark ``isin`` for pandas, Cylon and ``pandas.eval``.

    Args:
        num_rows: number of rows passed to ``get_dataframe``.
        num_cols: number of columns passed to ``get_dataframe``.
        filter_size: number of random values in the comparison list.
        unique_factor: forwarded to ``get_dataframe``.

    Returns:
        A tuple ``(pandas_time, cylon_time, pandas_eval_time)`` of elapsed
        seconds for each variant.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    ctx.add_config("compute_engine", "arrow")

    # NOTE: `df` and `cmp_data` must keep these exact names and stay local to
    # this function, because the pd.eval() expression below refers to them.
    df = get_dataframe(num_rows=num_rows, num_cols=num_cols,
                       unique_factor=unique_factor)
    ct = Table.from_pandas(ctx, df)
    cmp_data = np.random.randn(filter_size)
    cmp_data = cmp_data.tolist()

    # perf_counter() is monotonic and high resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    pandas_time = time.perf_counter()
    df.isin(cmp_data)
    pandas_time = time.perf_counter() - pandas_time

    cylon_time = time.perf_counter()
    ct.isin(cmp_data)
    cylon_time = time.perf_counter() - cylon_time

    pandas_eval_time = time.perf_counter()
    pd.eval('df.isin(cmp_data)')
    pandas_eval_time = time.perf_counter() - pandas_eval_time

    return pandas_time, cylon_time, pandas_eval_time
def test_df_perf_iterrows():
    """Time row-wise iteration in pandas against a Cylon-based equivalent.

    The pandas side uses ``DataFrame.iterrows``; the Cylon side rebuilds rows
    from ``Table.to_pydict(with_index=True)`` and the table's index values.
    Both elapsed times are printed; nothing is asserted.
    """
    ctx = CylonContext(config=None, distributed=False)
    num_rows = 100_000
    num_columns = 2

    data = np.random.randn(num_rows)

    pdf = pd.DataFrame({'data{}'.format(i): data for i in range(num_columns)})

    tb1 = Table.from_pandas(ctx, pdf)

    tb1.set_index(tb1.column_names[0], drop=True)
    pdf.set_index(pdf.columns[0], drop=True, inplace=True)

    print(pdf)
    t1 = time.perf_counter()
    for idx, row in pdf.iterrows():
        # Deliberate no-op assignments: touch each value like a consumer would.
        idx = idx
        row = row
    t2 = time.perf_counter()

    # Named `table_dict` rather than `dict` so the builtin is not shadowed.
    table_dict = tb1.to_pydict(with_index=True)
    indices = tb1.index.index_values
    rows = [[table_dict[col][index] for col in table_dict]
            for index in indices]

    for index, row in zip(indices, rows):
        index = index
        row = row
    t3 = time.perf_counter()
    print(t2 - t1, t3 - t2)
def test_isin():
    """Compare Cylon ``Table.isin`` with ``DataFrame.isin``.

    Both a plain list and a column-keyed dict are used as comparison values.
    A second table/frame pair with a different index is also built, but it is
    not part of the assertions.
    """
    animal_data = {'num_legs': [2, 4], 'num_wings': [2, 0]}
    index_labels = ['falcon', 'dog']
    other_index_labels = ['spider', 'falcon']

    pandas_frame = pd.DataFrame(animal_data, index=index_labels)

    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cylon_table = cn.Table.from_pydict(ctx, animal_data)
    cylon_table.set_index(index_labels, IndexingType.LINEAR)

    # Comparison inputs.
    values_as_list = [2, 0]
    values_as_dict = {'num_legs': [2, 0]}
    other_data = {'num_legs': [8, 2], 'num_wings': [0, 2]}

    # Constructed for parity with the pandas docs example; not asserted on.
    cn_tb_other = cn.Table.from_pydict(ctx, other_data)
    cn_tb_other.set_index(other_index_labels, IndexingType.LINEAR)
    other = pd.DataFrame(other_data, index=other_index_labels)

    for candidate in (values_as_list, values_as_dict):
        expected = pandas_frame.isin(candidate).values.tolist()
        actual = cylon_table.isin(candidate).to_pandas().values.tolist()
        assert expected == actual
def test_table_initialization_with_index():
    """Check the default index of tables built from CSV, pandas, list and dict.

    Each construction path is expected to yield a 0..n-1 index (or, for the
    pandas path, the same index values as the source DataFrame).
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)

    # CSV: index must be 0..row_count-1.
    csv_expected_index = list(range(tb.row_count))
    small_expected_index = [0, 1, 2]

    print(tb)
    print(tb.index.values)

    assert csv_expected_index == tb.index.values.tolist()

    row_major = [[1, 2, 3], [4, 5, 6], [6, 7, 8]]
    cols = ['a', 'b', 'c']
    column_dict = {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [6, 7, 8]}

    # pandas: index must mirror the DataFrame's own index.
    pdf = pd.DataFrame(row_major, columns=cols)
    print(pdf)
    from_pandas = Table.from_pandas(ctx, pdf)
    print(from_pandas)
    assert from_pandas.index.values.tolist() == pdf.index.values.tolist()

    # list of columns.
    from_list = Table.from_list(ctx, cols, row_major)
    print(from_list)
    print(from_list.index.values)
    assert small_expected_index == from_list.index.values.tolist()

    # dict of columns.
    from_dict = Table.from_pydict(ctx, column_dict)
    print(from_dict)
    print(from_dict.index.values)
    assert small_expected_index == from_dict.index.values.tolist()
# create model and move it to GPU with id rank model = Network().to(rank) ddp_model = DDP(model, device_ids=[rank]) loss_fn = nn.MSELoss() optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) optimizer.zero_grad() if rank == 0: print("Training A Dummy Model") for t in range(20): for x_batch, y_batch in zip(x_train, y_train): print(f"Epoch {t}", end='\r') prediction = ddp_model(x_batch) loss = loss_fn(prediction, y_batch) optimizer.zero_grad() loss.backward() optimizer.step() cleanup() if __name__ == '__main__': ctx: CylonContext = CylonContext('mpi') rank = ctx.get_rank() world_size = ctx.get_world_size() demo_basic(rank=rank, world_size=world_size) ctx.finalize()
# Example script: load a CSV into a Cylon Table and print its metadata and
# several selections obtained through Table indexing.
from pycylon import Table
from pycylon.csv import csv_reader
from pycylon import CylonContext

# No communication config is supplied here.
ctx: CylonContext = CylonContext(config=None)

# Read a comma-delimited CSV file into a Cylon table.
tb: Table = csv_reader.read(ctx, '/tmp/user_usage_tm_1.csv', ',')

print("Table Column Names")
print(tb.column_names)

print("Table Schema")
print(tb.schema)

# Exercise Table.__getitem__ with integer keys and slice keys; each result is
# converted to pandas for display.
print(tb[0].to_pandas())

print(tb[0:5].to_pandas())

print(tb[2:5].to_pandas())

print(tb[5].to_pandas())

print(tb[7].to_pandas())

# NOTE(review): presumably prints a row/column window of the table — confirm
# the argument order against the Table API.
tb.show_by_range(0, 4, 0, 4)

print(tb[0:5].to_pandas())

ctx.finalize()

import pyarrow as pa
# Example script: read two CSV files given on the command line and run a
# distributed left join (hash algorithm) on their first columns.
from pycylon.csv import csv_reader
from pycylon import Table
from pycylon import CylonContext
import argparse

# Context created with the "mpi" configuration string.
ctx: CylonContext = CylonContext("mpi")

parser = argparse.ArgumentParser(description='PyCylon Table Conversion')
parser.add_argument('--table1_path', type=str, help='Path to table 1 csv')
parser.add_argument('--table2_path', type=str, help='Path to table 2 csv')

args = parser.parse_args()

# Both inputs are read as comma-delimited CSV.
tb1: Table = csv_reader.read(ctx, args.table1_path, ',')

tb2: Table = csv_reader.read(ctx, args.table2_path, ',')

# Join parameters; the column entries are positional indices (0 on each side).
configs = {'join_type': 'left', 'algorithm': 'hash', 'left_col': 0,
           'right_col': 0}

tb3: Table = tb1.distributed_join(ctx, table=tb2,
                                  join_type=configs['join_type'],
                                  algorithm=configs['algorithm'],
                                  left_col=configs['left_col'],
                                  right_col=configs['right_col'])
def test_prefix_process():
    """Check that join prefixes are applied to the output column names.

    The same inner sort-join is expressed three ways (positional keys,
    ``left_on``/``right_on`` names, and ``on``), each with its own pair of
    prefixes. All three results must have the same shape, and their column
    names must carry the corresponding prefixes.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    table1_path = '/tmp/user_device_tm_1.csv'
    table2_path = '/tmp/user_usage_tm_1.csv'

    assert os.path.exists(table1_path) and os.path.exists(table2_path)

    tb1: Table = read_csv(ctx, table1_path, read_options)
    tb2: Table = read_csv(ctx, table2_path, read_options)

    configs = {'join_type': 'inner', 'algorithm': 'sort'}
    join_type = configs['join_type']
    algorithm = configs['algorithm']

    # Variant 1: join keys given as column positions.
    tb3: Table = tb1.join(table=tb2, join_type=join_type, algorithm=algorithm,
                          left_on=[0], right_on=[3],
                          left_prefix="l_", right_prefix="r_")

    print(tb3.row_count, tb3.column_count)

    # Variant 2: join keys given as per-side column names.
    tb4: Table = tb1.join(table=tb2, join_type=join_type, algorithm=algorithm,
                          left_on=['use_id'], right_on=['use_id'],
                          left_prefix="l_1_", right_prefix="r_1_")

    # Variant 3: a single shared key name.
    tb5: Table = tb1.join(table=tb2, join_type=join_type, algorithm=algorithm,
                          on=['use_id'],
                          left_prefix="l_2_", right_prefix="r_2_")

    same_rows = tb3.row_count == tb4.row_count == tb5.row_count
    same_columns = tb3.column_count == tb4.column_count == tb5.column_count
    assert same_rows and same_columns

    left_columns = ['use_id', 'user_id', 'platform_version', 'use_type_id']
    right_columns = ['outgoing_mins_per_month', 'outgoing_sms_per_month',
                     'monthly_mb', 'use_id']

    def prefixed(left_prefix, right_prefix):
        # Expected output schema: prefixed left columns, then right columns.
        return ([left_prefix + name for name in left_columns] +
                [right_prefix + name for name in right_columns])

    assert prefixed('l_', 'r_') == tb3.column_names
    assert prefixed('l_1_', 'r_1_') == tb4.column_names
    assert prefixed('l_2_', 'r_2_') == tb5.column_names

    ctx.finalize()
def test_table():
    """Smoke-test core ``Table`` operations on ``user_device_tm_1.csv``.

    Covers construction from Arrow, CSV read/write, sort by position and by
    name, merge, and projection by position and by name. Shapes and column
    names are asserted after each step.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table_path = '/tmp/user_device_tm_1.csv'

    # Arrow -> Cylon -> Arrow round trip.
    arrow_source = pyarrow_read_csv(table_path)
    tb = Table(arrow_source, ctx)
    assert isinstance(tb, Table)

    arrow_back = tb.to_arrow()
    assert isinstance(arrow_back, pa.Table)

    from_arrow = Table.from_arrow(ctx, arrow_source)
    assert from_arrow.row_count == 272 and from_arrow.column_count == 4

    # CSV read and write.
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    from_csv = read_csv(ctx, table_path, read_options)
    assert from_csv.row_count == 272 and from_csv.column_count == 4

    write_options = CSVWriteOptions().with_delimiter(',')
    from_csv.to_csv('/tmp/temp_record.csv', write_options)

    col_names = ['use_id', 'user_id', 'platform_version', 'use_type_id']

    # Sort by column position.
    sorted_by_position = from_csv.sort(1)
    for position, expected_name in enumerate(col_names):
        assert sorted_by_position.column_names[position] == expected_name
    assert sorted_by_position.row_count == 272 and \
        sorted_by_position.column_count == 4

    # Sort by column name.
    sorted_by_name = from_csv.sort('use_type_id')
    assert sorted_by_name.row_count == 272 and \
        sorted_by_name.column_count == 4
    for position, expected_name in enumerate(col_names):
        assert sorted_by_name.column_names[position] == expected_name

    # Merging a table with itself doubles the row count.
    merged = Table.merge([sorted_by_position, sorted_by_position])
    assert merged.row_count == 544 and merged.column_count == 4

    merged_alias = merged
    assert merged_alias.row_count == 544 and merged_alias.column_count == 4

    # Projection by position and by name.
    projected_by_position = from_csv.project([0, 1])
    assert projected_by_position.row_count == 272 and \
        projected_by_position.column_count == 2

    projected_by_name = from_csv.project(['use_id', 'platform_version'])
    assert projected_by_name.row_count == 272 and \
        projected_by_name.column_count == 2

    project_col_names = ['use_id', 'platform_version']
    for position, expected_name in enumerate(project_col_names):
        assert projected_by_name.column_names[position] == expected_name

    ctx.finalize()
# limitations under the License. ## """ Install: PyCylon (Follow: https://cylondata.org/docs/) Run Program: python demo_pytorch.py """ import os import numpy as np import pandas as pd from pycylon import CylonContext from pycylon import Table from pycylon.csv import csv_reader ctx: CylonContext = CylonContext(config='mpi') base_path = "/tmp" rank = ctx.get_rank() user_devices_file = os.path.join(base_path, f'user_device_tm_{rank+1}.csv') user_usage_file = os.path.join(base_path, f'user_usage_tm_{rank+1}.csv') user_devices_data: Table = csv_reader.read(ctx, user_devices_file, ',') user_usage_data: Table = csv_reader.read(ctx, user_usage_file, ',') user_devices_df: pd.DataFrame = user_devices_data.to_pandas() user_usage_df: pd.DataFrame = user_usage_data.to_pandas() print(
# Example script: read two CSV files given on the command line and run a
# distributed inner join (sort algorithm) on their first columns.
from pycylon.csv import csv_reader
from pycylon import Table
from pycylon import CylonContext
import argparse

# Context created with the "mpi" configuration string.
ctx: CylonContext = CylonContext(config="mpi")

parser = argparse.ArgumentParser(description='PyCylon Table Conversion')
parser.add_argument('--table1_path', type=str, help='Path to table 1 csv')
parser.add_argument('--table2_path', type=str, help='Path to table 2 csv')

args = parser.parse_args()

# Both inputs are read as comma-delimited CSV.
tb1: Table = csv_reader.read(ctx, args.table1_path, ',')

tb2: Table = csv_reader.read(ctx, args.table2_path, ',')

# Join parameters; the column entries are positional indices (0 on each side).
configs = {'join_type': 'inner', 'algorithm': 'sort', 'left_col': 0,
           'right_col': 0}

tb3: Table = tb1.distributed_join(ctx, table=tb2,
                                  join_type=configs['join_type'],
                                  algorithm=configs['algorithm'],
                                  left_col=configs['left_col'],
                                  right_col=configs['right_col'])
def test_arrow_index():
    """Exercise ``ArrowLocIndexer`` on a table indexed by an integer column.

    After indexing by column ``'a'`` the remaining column names and the index
    type are asserted; the indexer results themselves are only printed.
    """
    from pycylon.indexing.index import IndexingType
    from pycylon.indexing.index import ArrowLocIndexer

    pdf_float = pd.DataFrame({
        'a': pd.Series([1, 4, 7, 10, 20, 23, 11]),
        'b': pd.Series([2, 5, 8, 11, 22, 25, 12], dtype='int'),
        'c': pd.Series([12, 15, 18, 111, 122, 125, 112], dtype='int'),
        'd': pd.Series([212, 215, 218, 211, 222, 225, 312], dtype='int'),
        'e': pd.Series([1121, 12151, 12181, 12111, 12221, 12251, 13121],
                       dtype='int'),
    })
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf_float)

    print("Before Indexing")
    print(cn_tb)

    # Index both representations by 'a', dropping it from the columns.
    cn_tb.set_index('a', IndexingType.LINEAR, True)
    pdf_float = pdf_float.set_index('a')

    print("After Indexing")
    assert cn_tb.column_names == ['b', 'c', 'd', 'e']
    assert cn_tb.get_index().get_type() == IndexingType.LINEAR

    print(cn_tb.get_index().values)

    index_array = cn_tb.get_index().get_index_array()
    print(index_array)
    print(index_array.type)

    print(pa.scalar(10, index_array.type))

    arrow_loc_indexer = ArrowLocIndexer(IndexingType.LINEAR)

    # Range lookups (index 4..20) with an int, a slice and a list of columns.
    for column_selector in (0, slice(0, 1), [0, 1, 2]):
        selection = arrow_loc_indexer.loc_with_index_range(
            4, 20, column_selector, cn_tb)
        print(selection)
        print(selection.get_index().values)

    # Explicit index-value lookups with the same three column selector kinds.
    index_lookups = (
        ([4], 0),
        ([4, 20], slice(0, 1)),
        ([4, 20], [0, 1, 2]),
    )
    for wanted_indices, column_selector in index_lookups:
        selection = arrow_loc_indexer.loc_with_indices(
            wanted_indices, column_selector, cn_tb)
        print(selection)
        print(selection.get_index().values)
def test_iloc_op_mode_1():
    """Compare Cylon ``Table.iloc`` with ``DataFrame.iloc``.

    The table is indexed by the string column ``'a'``; a set of positional
    row/column selectors is then applied to both libraries and the selected
    values are compared.
    """
    from pycylon.indexing.index import IndexingType
    from pycylon.indexing.index_utils import IndexUtil

    pdf_float = pd.DataFrame({
        'a': pd.Series(["1", "4", "7", "10", "20", "23", "11"]),
        'b': pd.Series([2, 5, 8, 11, 22, 25, 12], dtype='int'),
        'c': pd.Series([12, 15, 18, 111, 122, 125, 112], dtype='int'),
        'd': pd.Series([212, 215, 218, 211, 222, 225, 312], dtype='int'),
        'e': pd.Series([1121, 12151, 12181, 12111, 12221, 12251, 13121],
                       dtype='int'),
    })
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf_float)

    print("Before Indexing")
    print(cn_tb)

    cn_tb.set_index('a', IndexingType.LINEAR, True)
    pdf_float = pdf_float.set_index('a')

    print("After Indexing")
    assert cn_tb.column_names == ['b', 'c', 'd', 'e']
    assert cn_tb.get_index().get_type() == IndexingType.LINEAR

    # (row selector, column selector) pairs plus whether to print the results.
    # slice(a, b) is exactly what the a:b subscript syntax produces.
    iloc_cases = (
        ((slice(3, 5), slice(1, 3)), True),           # [3:5, 1:3]
        ((slice(3, 5), slice(1, None)), True),        # [3:5, 1:]
        ((slice(3, None), slice(1, None)), False),    # [3:, 1:]
        ((slice(None, 3), slice(1, None)), True),     # [:3, 1:]
        ((slice(None), slice(None)), False),          # [:, :]
        (([0, 2, 3], slice(None)), False),            # [[0, 2, 3], :]
    )
    for selector, show in iloc_cases:
        cylon_view = cn_tb.iloc[selector]
        pandas_view = pdf_float.iloc[selector]
        if show:
            print(cylon_view)
            print(pandas_view)
        assert pandas_view.values.tolist() == \
            cylon_view.to_pandas().values.tolist()
def test_loc_op_mode_2():
    """Compare Cylon ``Table.loc`` with ``DataFrame.loc`` on a string index.

    For each label-based selector both the selected values and the resulting
    index array must match what pandas returns.
    """
    from pycylon.indexing.index import IndexingType

    pdf_float = pd.DataFrame({
        'a': pd.Series(["1", "4", "7", "10", "20", "23", "11"]),
        'b': pd.Series([2, 5, 8, 11, 22, 25, 12], dtype='int'),
        'c': pd.Series([12, 15, 18, 111, 122, 125, 112], dtype='int'),
        'd': pd.Series([212, 215, 218, 211, 222, 225, 312], dtype='int'),
        'e': pd.Series([1121, 12151, 12181, 12111, 12221, 12251, 13121],
                       dtype='int'),
    })
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf_float)

    print("Before Indexing")
    print(cn_tb)

    cn_tb.set_index('a', IndexingType.LINEAR, True)
    pdf_float = pdf_float.set_index('a')

    print("After Indexing")
    assert cn_tb.column_names == ['b', 'c', 'd', 'e']
    assert cn_tb.get_index().get_type() == IndexingType.LINEAR

    # (row selector, column selector) pairs; slice(a, b) is exactly what the
    # a:b subscript syntax produces.
    from_d = slice('d', None)
    loc_cases = (
        (slice("7", "20"), slice('c', 'e')),   # ["7":"20", 'c':'e']
        (slice("7", "20"), from_d),            # ["7":"20", 'd':]
        (slice("7", None), from_d),            # ["7":, 'd':]
        (slice(None, "7"), from_d),            # [:"7", 'd':]
        (slice(None), from_d),                 # [:, 'd':]
        (["7", "20"], from_d),                 # [["7", "20"], 'd':]
    )
    for selector in loc_cases:
        cylon_view = cn_tb.loc[selector]
        pandas_view = pdf_float.loc[selector]
        assert pandas_view.values.tolist() == \
            cylon_view.to_pandas().values.tolist()
        assert cylon_view.get_index().get_index_array() == \
            pa.array(pandas_view.index)
def test_table_is_in_dev():
    """Prototype of a table-vs-table ``isin`` built from Arrow compute calls.

    A cell of ``tb`` is flagged when its column name appears in ``tb_cmp``,
    its row index appears in ``tb_cmp``'s index, and its value appears in the
    same-named column of ``tb_cmp``. The resulting boolean table is printed;
    nothing is asserted.
    """
    from pyarrow import compute as a_compute

    col_names_ = ['col-1', 'col-2']
    comp_col_names_ = ['col-11', 'col-2']
    row_indices_ = ['1', '2', '3', '4']
    row_indices_cmp_ = ['1', '21', '3', '41']
    data = [[2, 4, 3, 1], [0, 2, 1, 2]]
    cmp_data = [[12, 4, 13, 1], [10, 2, 12, 3]]

    ctx: CylonContext = CylonContext(config=None, distributed=False)

    tb = cn.Table.from_list(ctx, col_names_, data)
    tb.set_index(row_indices_)
    tb_cmp = cn.Table.from_list(ctx, comp_col_names_, cmp_data)
    tb_cmp.set_index(row_indices_cmp_)

    def compare_array_like_values(l_org_ar, l_cmp_ar, skip_null=True):
        # Element-wise membership of l_org_ar in l_cmp_ar.
        return a_compute.is_in(l_org_ar, value_set=l_cmp_ar,
                               skip_nulls=skip_null)

    def broadcast(ar, broadcast_coefficient=1):
        # Repeat every scalar of `ar` into its own array of the given length.
        return [pa.array([elem.as_py()] * broadcast_coefficient)
                for elem in ar]

    def compare_two_arrays(l_ar, r_ar):
        # Element-wise logical AND of two boolean arrays.
        return a_compute.and_(l_ar, r_ar)

    def compare_row_and_column(row, columns):
        # AND the row-validity mask with each broadcast column-validity mask.
        comp_res = []
        for column in columns:
            print(type(column), type(row))
            comp_res.append(compare_two_arrays(l_ar=row, r_ar=column))
        return comp_res

    def tb_compare_values(tb, tb_cmp, skip_null=True):
        col_names = tb.column_names
        comp_col_names = tb_cmp.column_names

        row_indices = tb.index.index_values
        row_indices_cmp = tb_cmp.index.index_values

        col_comp_res = compare_array_like_values(
            l_org_ar=pa.array(col_names), l_cmp_ar=pa.array(comp_col_names))
        row_comp_res = compare_array_like_values(
            l_org_ar=pa.array(row_indices),
            l_cmp_ar=pa.array(row_indices_cmp))

        # The comparison data has to be broadcast so that it matches the
        # number of rows in the table; use the table's own row count rather
        # than a constant captured from the enclosing scope.
        bcast_col_comp_res = broadcast(ar=col_comp_res,
                                       broadcast_coefficient=tb.row_count)
        row_col_comp = compare_row_and_column(row=row_comp_res,
                                              columns=bcast_col_comp_res)

        tb_ar = tb.to_arrow().combine_chunks()
        tb_cmp_ar = tb_cmp.to_arrow().combine_chunks()

        col_data_map = {}
        for col_name, validity, row_col_validity in zip(
                col_names, col_comp_res, row_col_comp):
            if validity.as_py():
                chunk_ar_org = tb_ar.column(col_name)
                chunk_ar_cmp = tb_cmp_ar.column(col_name)
                data_cmp_res = a_compute.is_in(chunk_ar_org,
                                               value_set=chunk_ar_cmp,
                                               skip_nulls=skip_null)
                print(data_cmp_res, row_col_validity)
                col_data_map[col_name] = compare_two_arrays(
                    data_cmp_res, row_col_validity)
            else:
                # Column absent from the comparison table: all False.
                col_data_map[col_name] = pa.array([False] * tb.row_count)

        is_in_values = list(col_data_map.values())
        return cn.Table.from_list(tb.context, col_names, is_in_values)

    new_tb = tb_compare_values(tb, tb_cmp)
    print(new_tb)
def test_distributed_ra():
    """Check distributed join/union/subtract/intersect shapes on 4 MPI ranks.

    Each rank loads its own ``user_usage_tm_<rank+1>.csv`` twice, runs the four
    distributed operations between the two copies and compares the resulting
    shapes with per-rank expected values.
    """
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    rank = ctx.get_rank()
    size = ctx.get_world_size()

    assert size == 4

    table1_path = f'/tmp/user_usage_tm_{rank + 1}.csv'
    table2_path = f'/tmp/user_usage_tm_{rank + 1}.csv'

    assert os.path.exists(table1_path)
    assert os.path.exists(table2_path)

    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb1: Table = read_csv(ctx, table1_path, read_options)
    tb2: Table = read_csv(ctx, table2_path, read_options)

    print("First Hello World From Rank {}, Size {}".format(
        ctx.get_rank(), ctx.get_world_size()))

    joined: Table = tb1.distributed_join(table=tb2, join_type='inner',
                                         algorithm='hash', left_on=[0],
                                         right_on=[0])
    united: Table = tb1.distributed_union(tb2)
    subtracted: Table = tb1.distributed_subtract(tb2)
    intersected: Table = tb1.distributed_intersect(tb2)

    ctx.barrier()

    join_shape = (joined.row_count, joined.column_count)
    subtract_shape = (subtracted.row_count, subtracted.column_count)
    union_shape = (united.row_count, united.column_count)
    intersect_shape = (intersected.row_count, intersected.column_count)

    # rank -> (expected join rows, expected union/intersect rows)
    expected_rows = {
        0: (1424, 112),
        1: (1648, 122),
        2: (2704, 102),
        3: (1552, 144),
    }

    if rank in expected_rows:
        join_rows, set_rows = expected_rows[rank]
        assert join_shape == (join_rows, 8)
        assert subtract_shape == (0, 4)
        assert union_shape == (set_rows, 4)
        assert intersect_shape == (set_rows, 4)
def test_concat_op():
    """Compare ``Table.concat`` with ``pandas.concat`` for both axes.

    Tables are indexed by their first column. For ``axis=1`` and ``axis=0``
    inner concatenations, both the values and the resulting index must match
    pandas. The test also checks that concatenation leaves the index values
    of the input tables unchanged (order-insensitively).
    """
    from pycylon.net import MPIConfig

    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)
    columns = ['c1', 'c2', 'c3']
    dataset_1 = [[1, 2, 3, 4, 5], [20, 30, 40, 50, 51], [33, 43, 53, 63, 73]]
    dataset_2 = [[1, 20, 3, 4, 50], [20, 30, 40, 50, 51], [33, 43, 53, 63, 73]]
    dataset_3 = [[1, 20, 3, 40, 50, 60], [21, 31, 41, 51, 50, 70],
                 [32, 42, 52, 62, 72, 82]]

    tb1 = Table.from_list(ctx, columns, dataset_1)
    tb1 = tb1.add_prefix('d1_')

    tb2 = Table.from_list(ctx, columns, dataset_2)
    tb2 = tb2.add_prefix('d2_')

    tb3 = Table.from_list(ctx, columns, dataset_3)
    tb3 = tb3.add_prefix('d3_')

    # Same data as tb3 but with tb1's column names, for the axis=0 case.
    tb4 = Table.from_list(ctx, columns, dataset_3)
    tb4 = tb4.add_prefix('d1_')

    pdf1 = tb1.to_pandas()
    pdf2 = tb2.to_pandas()
    pdf3 = tb3.to_pandas()
    pdf4 = tb4.to_pandas()

    print(tb1)
    print("-" * 80)
    print(tb2)

    tb1.set_index(tb1.column_names[0], drop=True)
    tb2.set_index(tb2.column_names[0], drop=True)
    tb3.set_index(tb3.column_names[0], drop=True)
    tb4.set_index(tb4.column_names[0], drop=True)

    print("*" * 80)
    print("Indexed table")
    print(tb1)
    print("*" * 80)

    pdf1.set_index(pdf1.columns[0], drop=True, inplace=True)
    pdf2.set_index(pdf2.columns[0], drop=True, inplace=True)
    pdf3.set_index(pdf3.columns[0], drop=True, inplace=True)
    pdf4.set_index(pdf4.columns[0], drop=True, inplace=True)

    print("=" * 80)
    print("axis=1")
    print("=" * 80)
    res_pdf_1 = pd.concat([pdf1, pdf2], join='inner', axis=1)
    print(res_pdf_1)
    print("-" * 80)

    tables = [tb1, tb2]
    # Snapshot the input indices before concatenating.
    tb1_index_values = tb1.index.index_values
    tb2_index_values = tb2.index.index_values
    res_tb_1 = Table.concat(tables, join='inner', axis=1)
    print(res_tb_1)
    print("-" * 80)

    res_pdf_2 = pd.concat([pdf1, pdf2], join='inner', axis=1)
    print(res_pdf_2)
    assert res_pdf_2.values.tolist() == res_tb_1.to_pandas().values.tolist()
    assert res_tb_1.index.index_values == res_pdf_2.index.values.tolist()
    print("-" * 80)
    print(tb1.to_arrow())
    print(tb2.to_arrow())
    print(tb1.index.index_values, tb1_index_values)
    print(tb2.index.index_values, tb2_index_values)

    # list.sort() sorts in place and returns None, so comparing its return
    # values would always pass; sorted() returns the sorted copies to compare.
    assert sorted(tb1.index.index_values) == sorted(tb1_index_values)
    assert sorted(tb2.index.index_values) == sorted(tb2_index_values)

    print("=" * 80)
    print("axis=0")
    print("=" * 80)
    res_pdf_3 = pd.concat([pdf1, pdf4], join='inner', axis=0)
    print(tb1.column_names, tb4.column_names)
    res_tb_2 = Table.concat([tb1, tb4], join='inner', axis=0)
    print(res_tb_2)
    print(res_tb_2.index.index_values)
    print(res_pdf_3)
    print(res_pdf_3.index.values.tolist())
    assert res_pdf_3.values.tolist() == res_tb_2.to_pandas().values.tolist()
    assert res_tb_2.index.index_values == res_pdf_3.index.values.tolist()
def test_concat_table():
    """Print reference ``pandas.concat`` results for the Cylon concat design.

    Design notes for the Cylon concat operation:

    The index column can be inspected: by default the index array contains
    [0, num_records-1); if the table is indexed, the indexed column is
    compared. Existing join ops can be reused.

    Algorithm
    =========

    axis=1 (regular join op considering a column)
    ----------------------------------------------
    1. Whether indexed or not, do a reset_index op (which adds the new column
       as 'index' in both tables).
    2. Do the regular join on the 'index' column.
    3. Set the index by 'index' in the resulting table.

    axis=0 (stacking tables, similar to the merge function)
    -------------------------------------------------------
    assert: column count must match.
    The two tables are stacked upon each other in order.
    The index is created by concatenating the two indices.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    columns = ['c1', 'c2', 'c3']
    dataset_1 = [[1, 2, 3, 4, 5], [20, 30, 40, 50, 51], [33, 43, 53, 63, 73]]
    dataset_2 = [[1, 20, 3, 4, 50], [20, 30, 40, 50, 51], [33, 43, 53, 63, 73]]
    dataset_3 = [[1, 20, 3, 40, 50, 60], [21, 31, 41, 51, 50, 70],
                 [32, 42, 52, 62, 72, 82]]

    tb1 = Table.from_list(ctx, columns, dataset_1).add_prefix('d1_')
    tb2 = Table.from_list(ctx, columns, dataset_2).add_prefix('d2_')
    tb3 = Table.from_list(ctx, columns, dataset_3).add_prefix('d3_')
    # Same data as tb3 but with tb1's column names.
    tb4 = Table.from_list(ctx, columns, dataset_3).add_prefix('d1_')

    pdf1 = tb1.to_pandas()
    pdf2 = tb2.to_pandas()
    pdf3 = tb3.to_pandas()
    pdf4 = tb4.to_pandas()

    heavy_rule = "=" * 80
    light_rule = "-" * 80
    star_rule = "*" * 80

    print(tb1)
    print(light_rule)
    print(tb2)

    # Only the first three tables/frames are indexed; tb4/pdf4 keep defaults.
    for table in (tb1, tb2, tb3):
        table.set_index(table.column_names[0], drop=True)

    print(star_rule)
    print("Indexed table")
    print(tb1)
    print(star_rule)
    print("Reset_Index table")
    tb1.reset_index()
    print(tb1)
    print(star_rule)

    for frame in (pdf1, pdf2, pdf3):
        frame.set_index(frame.columns[0], drop=True, inplace=True)

    def show_concat(frames, axis):
        # Print the inner concat of `frames` along `axis`.
        print(pd.concat(frames, join='inner', axis=axis))

    print(heavy_rule)
    print("axis=1")
    print(heavy_rule)
    for frames in ([pdf1, pdf2], [pdf1, pdf3]):
        show_concat(frames, 1)
        print(light_rule)

    print(heavy_rule)
    print("axis=0")
    print(heavy_rule)
    for frames in ([pdf1, pdf2], [pdf1, pdf3], [pdf1, pdf4]):
        show_concat(frames, 0)
        print(light_rule)

    print("Multi Table Concat 1")
    show_concat([pdf1, pdf2, pdf3], 1)
    print("Multi Table Concat 2")
    show_concat([pdf2, pdf3, pdf1], 1)
def test_math_i_ops_for_scalar():
    """Compare in-place scalar arithmetic between Cylon and pandas.

    The four augmented assignments (``+=``, ``-=``, ``*=``, ``/=``) are applied
    first to a single column and then to the whole table/frame; after each
    step the Cylon values must equal the pandas values.

    TODO: Enhance test case and functionality. Check the following case:
    https://github.com/cylondata/cylon/issues/229

    >>> from operator import __iadd__
    >>> assert __iadd__(cylon_table, value) == (cylon_table += value)
    >>> Failure
    ...
    """
    npr = np.array([[20, 2, 3, 4, 5], [10, -20, -30, -40, -50],
                    [12.2, 13.2, 16.4, 12.2, 10.8]])
    pdf = DataFrame(npr)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf)

    def values_match(pandas_obj, cylon_obj):
        # True when both hold the same cell values.
        return pandas_obj.values.tolist() == \
            cylon_obj.to_pandas().values.tolist()

    # --- column-level in-place ops (Cylon column '0' vs pandas column 0) ---
    column_tb = cn_tb
    column_pdf = pdf

    column_tb['0'] += 2
    column_pdf[0] += 2
    assert values_match(column_pdf, column_tb)

    column_tb['0'] -= 2
    column_pdf[0] -= 2
    assert values_match(column_pdf, column_tb)

    column_tb['0'] *= 2
    column_pdf[0] *= 2
    assert values_match(column_pdf, column_tb)

    column_tb['0'] /= 2
    column_pdf[0] /= 2
    assert values_match(column_pdf, column_tb)

    # --- table-level in-place ops ---
    # The pandas operation is applied through `pdf` and observed through the
    # alias `whole_pdf`, exactly as the Cylon side is driven via `whole_tb`.
    whole_tb = cn_tb
    whole_pdf = pdf

    whole_tb += 2
    pdf += 2
    assert values_match(whole_pdf, whole_tb)

    whole_tb -= 2
    pdf -= 2
    assert values_match(whole_pdf, whole_tb)

    whole_tb *= 2
    pdf *= 2
    assert values_match(whole_pdf, whole_tb)

    whole_tb /= 2
    pdf /= 2
    assert values_match(whole_pdf, whole_tb)
# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ## import numpy as np import pyarrow as pa import pandas as pd import pycylon as cn from pycylon import CylonContext ctx: CylonContext = CylonContext(config=None, distributed=False) columns = 2 data1 = np.array([0, 1, 2, 3, 4, 5], dtype=np.int32) data2 = np.array([10, 11, 12, 13, 14, 15], dtype=np.float32) nd_array_list = [data1, data2] ar_array: pa.array = pa.array(nd_array_list) ar_table: pa.Table = pa.Table.from_arrays(nd_array_list, names=['x0', 'x1']) print(ar_table) ar1 = pa.array([1, 2, 3, 4])
def test_aggregate():
    """Exercise table construction, conversions, aggregates and group-by.

    Covers ``Table.from_numpy/from_list/from_pydict/from_pandas``, the
    ``to_numpy``/``to_pydict`` conversions, the ``sum``/``count``/``min``/
    ``max`` aggregates (by column name and by position), and ``groupby`` with
    SUM/MIN/MAX compared against pandas.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    columns = 2

    # --- construction from numpy / arrow ---
    data1 = np.array([0, 1, 2, 3, 4, 5], dtype=np.int32)
    data2 = np.array([10, 11, 12, 13, 14, 15], dtype=np.float32)

    nd_array_list = [data1, data2]

    ar_array: pa.array = pa.array(nd_array_list)

    ar_table: pa.Table = pa.Table.from_arrays(nd_array_list,
                                              names=['x0', 'x1'])

    int_array = pa.array([1, 2, 3, 4])
    str_array = pa.array(['a', 'b', 'c', 'd'])

    ar_tb2: pa.Table = pa.Table.from_arrays([int_array, str_array],
                                            names=['col1', 'col2'])

    assert isinstance(ar_tb2, pa.Table)

    col_names = ['col1', 'col2']

    from_numpy = cn.Table.from_numpy(ctx, col_names, nd_array_list)
    assert from_numpy.row_count == data1.shape[0] and \
        from_numpy.column_count == len(nd_array_list)

    # --- construction from list ---
    data_list = [[1, 2, 3, 4], ['p', 'q', 'r', 's']]
    from_list = cn.Table.from_list(ctx, col_names, data_list)
    assert from_list.row_count == len(data_list[0]) and \
        from_list.column_count == len(data_list)

    # --- construction from dict ---
    dict1 = {'col1': [1, 2], 'col2': ['a', 'b']}
    ar_tb3: pa.Table = pa.Table.from_pydict(dict1)
    from_dict: cn.Table = cn.Table.from_pydict(ctx, dict1)
    assert from_dict.row_count == len(dict1['col1']) and \
        from_dict.column_count == len(dict1)

    # --- construction from pandas ---
    pdf = pd.DataFrame(dict1)
    from_pandas: cn.Table = cn.Table.from_pandas(ctx, pdf)
    assert from_pandas.row_count == len(dict1['col1']) and \
        from_pandas.column_count == len(dict1)

    # --- conversions ---
    dict2 = {'col1': [1, 2, 3], 'col2': [2, 4, 3]}
    expected_shape = (len(dict2['col1']), len(dict2))

    cn_tb5: cn.Table = cn.Table.from_pydict(ctx, dict2)
    assert cn_tb5.row_count == len(dict2['col1']) and \
        cn_tb5.column_count == len(dict2)

    as_numpy = cn_tb5.to_numpy()
    assert as_numpy.shape == expected_shape

    as_dict = cn_tb5.to_pydict()
    assert as_dict == dict2

    for dict_key, column_name in zip(as_dict.keys(), cn_tb5.column_names):
        assert dict_key == column_name

    assert cn_tb5.to_numpy().shape == expected_shape

    # --- scalar aggregates on 'col1', addressed by name and by position ---
    col1_values = dict2['col1']
    aggregate_cases = (
        (cn_tb5.sum, sum(col1_values)),
        (cn_tb5.count, len(col1_values)),
        (cn_tb5.min, min(col1_values)),
        (cn_tb5.max, max(col1_values)),
    )
    for aggregate, expected in aggregate_cases:
        for column_ref in ('col1', 0):
            assert aggregate(column_ref).to_numpy()[0][0] == expected

    from pycylon.data.aggregates import AggregationOp

    op1 = AggregationOp.SUM
    assert (op1 == AggregationOp.SUM)

    # --- group-by compared with pandas ---
    df = pd.DataFrame({
        'AnimalId': [1, 1, 2, 2, 3, 4, 4, 3],
        'Max Speed': [380., 370., 24., 26., 23.1, 300.1, 310.2, 25.2]
    })

    ar_tb_gb = pa.Table.from_pandas(df)
    assert isinstance(ar_tb_gb, pa.Table)

    cn_tb_gb = cn.Table.from_arrow(ctx, ar_tb_gb)
    assert isinstance(cn_tb_gb, cn.Table)

    def assert_groupby_matches(aggregated_columns, op, pandas_result):
        # Group by column 0, aggregate, sort by key and compare to pandas.
        cylon_result = cn_tb_gb.groupby(0, aggregated_columns, [op]).sort(0)
        for val1, val2 in zip(cylon_result.to_pydict()['Max Speed'],
                              pandas_result.to_dict()['Max Speed'].values()):
            assert val1 == val2

    pandas_sum = df.groupby(['AnimalId']).sum()
    assert_groupby_matches([1], AggregationOp.SUM, pandas_sum)
    assert_groupby_matches(['Max Speed'], AggregationOp.SUM, pandas_sum)

    pandas_min = df.groupby(['AnimalId']).min()
    assert_groupby_matches(['Max Speed'], AggregationOp.MIN, pandas_min)

    pandas_max = df.groupby(['AnimalId']).max()
    assert_groupby_matches(['Max Speed'], AggregationOp.MAX, pandas_max)