def test_table():
    """Exercise basic ``Table`` operations against a CSV fixture.

    Covers construction from a pyarrow table, round-tripping to arrow,
    CSV read/write, sorting by integer and by column name, merging and
    projecting. The fixture at ``/tmp/user_device_tm_1.csv`` is expected to
    hold 272 rows and the 4 columns listed in ``col_names``.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    # Finalize the context even when an assertion fails, so a failing test
    # does not leave the context un-finalized.
    try:
        table_path = '/tmp/user_device_tm_1.csv'
        expected_rows, expected_cols = 272, 4

        # Construction from an in-memory pyarrow table and back again.
        pyarrow_table = pyarrow_read_csv(table_path)
        tb = Table(pyarrow_table, ctx)
        assert isinstance(tb, Table)

        ar_tb2 = tb.to_arrow()
        assert isinstance(ar_tb2, pa.Table)

        tb2 = Table.from_arrow(ctx, pyarrow_table)
        assert tb2.row_count == expected_rows and tb2.column_count == expected_cols

        # CSV read with explicit read options, then write back out.
        csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
        tb3 = read_csv(ctx, table_path, csv_read_options)
        assert tb3.row_count == expected_rows and tb3.column_count == expected_cols

        csv_write_options = CSVWriteOptions().with_delimiter(',')
        tb3.to_csv('/tmp/temp_record.csv', csv_write_options)

        col_names = ['use_id', 'user_id', 'platform_version', 'use_type_id']

        # Sorting keyed by an integer (presumably a column position) must
        # keep the schema and the row count intact.
        tb4 = tb3.sort(1)
        for idx, col in enumerate(col_names):
            assert tb4.column_names[idx] == col
        assert tb4.row_count == expected_rows and tb4.column_count == expected_cols

        # Same check when the sort key is given by column name.
        tb5 = tb3.sort('use_type_id')
        assert tb5.row_count == expected_rows and tb5.column_count == expected_cols
        for idx, col in enumerate(col_names):
            assert tb5.column_names[idx] == col

        # Merging a table with itself doubles the rows, same columns.
        tb6 = Table.merge([tb4, tb4])
        assert tb6.row_count == 2 * expected_rows and tb6.column_count == expected_cols

        # NOTE(review): tb7 is only an alias of tb6, so this re-checks the
        # same object; confirm whether another operation was intended here.
        tb7 = tb6
        assert tb7.row_count == 2 * expected_rows and tb7.column_count == expected_cols

        # Projection by integer indices.
        tb8 = tb3.project([0, 1])
        assert tb8.row_count == expected_rows and tb8.column_count == 2

        # Projection by column names keeps the requested order.
        project_col_names = ['use_id', 'platform_version']
        tb9 = tb3.project(project_col_names)
        assert tb9.row_count == expected_rows and tb9.column_count == 2
        for idx, col in enumerate(project_col_names):
            assert tb9.column_names[idx] == col
    finally:
        ctx.finalize()
from pycylon import Table
from pycylon import CylonContext

# Micro-benchmark: time a pandas outer merge of two random frames against
# pycylon's Table.merge on tables built from the same frames, and print
# both elapsed times.
ctx: CylonContext = CylonContext(config=None, distributed=False)

# Kept tiny here; the original note next to this value was 10_000_000
# (presumably the size used for a real benchmark run).
num_rows = 10

# First frame: 100 columns sharing one random vector, plus an integer key.
data = np.random.randn(num_rows)
df = pd.DataFrame({f'data{i}': data for i in range(100)})
df['key'] = np.random.randint(0, 100, size=num_rows)
ct = Table.from_pandas(ctx, df)
ct.set_index(range(0, num_rows))

# Second frame: built from its own random vector. The previous version
# generated data1 but then reused `data`, leaving data1 unused.
data1 = np.random.randn(num_rows)
df1 = pd.DataFrame({f'data{i}': data1 for i in range(100)})
df1['key'] = np.random.randint(0, 100, size=num_rows)
ct1 = Table.from_pandas(ctx, df1)
ct1.set_index(range(0, num_rows))

# perf_counter is monotonic and high resolution, which suits short
# interval timing better than wall-clock time.time().
t1 = time.perf_counter()
new_df = df.merge(df1, how='outer')
t2 = time.perf_counter()

# NOTE(review): confirm that Table.merge takes the context as its first
# argument in the installed pycylon version, and that it is an operation
# comparable to the pandas outer merge timed above.
t3 = time.perf_counter()
new_ct = Table.merge(ctx, [ct, ct1])
t4 = time.perf_counter()

print('pandas :', t2 - t1, 'cylon :', t4 - t3)

# Release the context once the measurements are done.
ctx.finalize()