def test_isin_with_index():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb1: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb1.to_pandas()
    filter_isin = [20, 11, 23]
    print(tb1)
    print(pdf)
    tb1.set_index('a', drop=True)
    pdf.set_index('a', inplace=True)
    filter_pdf: pd.DataFrame = pdf[['b', 'c']].iloc[0:5]
    tb_res = tb1[tb1['b'].isin(filter_isin)]
    pdf_res = pdf[pdf['b'].isin(filter_isin)]
    print(tb_res)
    print(pdf_res)
    assert tb_res.to_pandas().values.tolist() == pdf_res.values.tolist()
    print(tb_res.index.values)
    print(pdf_res.index.values)
    assert tb_res.index.values.tolist() == pdf_res.index.values.tolist()

def test_df_iterrows():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb1: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb1.to_pandas()
    tb1.set_index(tb1.column_names[0], drop=True)
    pdf.set_index(pdf.columns[0], drop=True, inplace=True)
    num_records = tb1.row_count
    print(pdf)
    for idx, row in pdf.iterrows():
        print(idx)
        print(row)
    # Reconstruct rows manually from the table's column dictionary and index
    # (renamed from `dict` to avoid shadowing the builtin)
    pydict = tb1.to_pydict(with_index=False)
    indices = tb1.index.index_values
    rows = []
    for index_id, index in enumerate(indices):
        row = []
        for col in pydict:
            row.append(pydict[col][index_id])
        rows.append(row)
    for index, row in zip(indices, rows):
        print(index, row)
    for index1, row1, composite in zip(indices, rows, pdf.iterrows()):
        index2 = composite[0]
        row2 = composite[1].tolist()
        assert index1 == index2
        assert row1 == row2

def test_distributed_sort():
    import numpy as np
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)
    rank = ctx.get_rank()
    size = ctx.get_world_size()
    assert size == 4
    table1_path = f'/tmp/user_usage_tm_{rank + 1}.csv'
    assert os.path.exists(table1_path)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, table1_path, csv_read_options)
    print(tb1)
    tb2 = tb1.distributed_sort(order_by='use_id')
    col_data = tb2['use_id'].to_numpy()
    col_data = np.reshape(col_data, (col_data.shape[0]))

    def is_sort_array(array):
        for i in range(array.shape[0] - 1):
            if array[i] > array[i + 1]:
                return False
        return True

    assert is_sort_array(col_data)

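# A vectorized alternative to the element-wise sortedness check above; a
# minimal sketch using plain NumPy, not part of the original suite.
def is_sorted_vectorized(array):
    import numpy as np
    # A 1-D array is non-decreasing iff every element is <= its successor.
    return bool(np.all(array[:-1] <= array[1:]))
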
def test_isin_column():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb1: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb1.to_pandas()
    tb1.set_index(tb1.column_names[0], drop=True)
    pdf.set_index(pdf.columns[0], drop=True, inplace=True)
    print(tb1)
    print(pdf)
    isin_values = [10, 20, 30, 5, 2, 8]
    tbx = tb1['b'].isin(isin_values)
    pdfx = pdf['b'].isin(isin_values)
    print(tbx)
    print(pdfx)
    tb_list = tbx.to_pandas().values.flatten().tolist()
    pd_list = pdfx.values.tolist()
    assert tb_list == pd_list
    print(tb_list)
    print(pd_list)

def test_unique():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb1: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb1.to_pandas()
    expected_indices_of_sort_col = [1, 2, 3, 4, 5, 7, 10, 12, 13, 14, 15]
    print("Original Data")
    print(pdf)
    tb2 = tb1['b'].unique()
    pdf2 = pdf['b'].unique()
    tb2.show()
    print("Unique Pdf")
    print(pdf2)
    print(type(pdf2))
    print("Unique Cylon")
    print(tb2)
    tb3_list = list(tb2.to_pydict().items())[0][1]
    pdf3_list = pdf2.tolist()
    assert tb3_list == pdf3_list
    set_pdf4 = set(pdf2)
    set_tb4 = set(tb3_list)
    assert set_tb4 == set_pdf4
    ctx.finalize()

def test_df_with_index():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb1: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb1.to_pandas()
    print(pdf.columns[0])
    pdf1 = pdf[[pdf.columns[0]]]
    print(pdf)
    print(pdf1)
    pdf3 = pdf.set_index(pdf.columns[0], drop=True)
    print(pdf3)
    artb = pa.Table.from_pandas(df=pdf3, schema=None, preserve_index=True,
                                nthreads=None, columns=None, safe=False)
    print(artb)

def test_properties():
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table1_path = '/tmp/user_usage_tm_1.csv'
    table2_path = '/tmp/user_usage_tm_2.csv'
    assert os.path.exists(table1_path) and os.path.exists(table2_path)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb: Table = read_csv(ctx, table1_path, csv_read_options)
    pdf = tb.to_pandas()

    # `mask` and `data` renamed from `filter` and `input` to avoid shadowing builtins
    def generate_filter_and_result(op, column: str, data, comparison_value):
        if column:
            mask = op(data[column], comparison_value)
        else:
            mask = op(data, comparison_value)
        return mask, data[mask]

    def do_comparison_on_pdf_and_tb(tb_filter: Table, tb_result: Table,
                                    pdf_filter: DataFrame, pdf_result: DataFrame,
                                    is_full_table):
        if is_full_table:
            assert tb_filter.to_pandas().values.tolist() == pdf_filter.values.tolist()
            assert tb_result.to_pandas().fillna(0).values.tolist() == \
                   pdf_result.fillna(0).values.tolist()
        else:
            assert tb_filter.to_pandas().values.flatten().tolist() == \
                   pdf_filter.values.tolist()
            assert tb_result.to_pandas().values.tolist() == pdf_result.values.tolist()

    ops = [operator.__eq__, operator.__ne__, operator.__lt__,
           operator.__gt__, operator.__le__, operator.__ge__]
    value = 519.12
    columns = ['monthly_mb', None]
    is_full_table_flags = [False, True]
    for column, is_full_table in zip(columns, is_full_table_flags):
        for op in ops:
            tb_filter_all, tb_filter_all_result = generate_filter_and_result(
                op, column, tb, value)
            pdf_filter_all, pdf_filter_all_result = generate_filter_and_result(
                op, column, pdf, value)
            do_comparison_on_pdf_and_tb(tb_filter=tb_filter_all,
                                        tb_result=tb_filter_all_result,
                                        pdf_filter=pdf_filter_all,
                                        pdf_result=pdf_filter_all_result,
                                        is_full_table=is_full_table)

def test_multi_process():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)
    rank, size = ctx.get_rank(), ctx.get_world_size()
    assert size == 4
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table1_path = f'/tmp/user_device_tm_{rank + 1}.csv'
    table2_path = f'/tmp/user_usage_tm_{rank + 1}.csv'
    assert os.path.exists(table1_path) and os.path.exists(table2_path)
    tb1: Table = read_csv(ctx, table1_path, csv_read_options)
    tb2: Table = read_csv(ctx, table2_path, csv_read_options)
    print(tb1.column_names)
    print(tb2.column_names)
    configs = {'join_type': 'inner', 'algorithm': 'sort',
               'left_col': 0, 'right_col': 0}
    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=[0],
                                      right_on=[3])
    tb4: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=['use_id'],
                                      right_on=['use_id'])
    tb5: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      on=['use_id'])
    # the original repeated tb4 here; tb5 is the intended third operand
    assert tb3.column_count == tb4.column_count == tb5.column_count == 8
    if rank == 0:
        assert tb3.row_count == tb4.row_count == tb5.row_count == 640
    if rank == 1:
        assert tb3.row_count == tb4.row_count == tb5.row_count == 624
    if rank == 2:
        assert tb3.row_count == tb4.row_count == tb5.row_count == 592
    if rank == 3:
        assert tb3.row_count == tb4.row_count == tb5.row_count == 688

def test_col_access():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb1: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb1.to_pandas()
    print(tb1)
    tbx = tb1[tb1.column_names[0]]
    print(tbx)
    npy = tbx.to_numpy().flatten().tolist()
    print(npy)

def test_read_csv_with_use_cols():
    ctx = CylonContext(config=None, distributed=False)
    use_cols = ['a', 'b']
    csv_read_options = CSVReadOptions() \
        .use_threads(True) \
        .block_size(1 << 30) \
        .use_cols(use_cols)
    table_path = '/tmp/duplicate_data_0.csv'
    tb1: Table = read_csv(ctx, table_path, csv_read_options)
    pdf = pd.read_csv(table_path, usecols=use_cols)
    assert tb1.column_names == use_cols == pdf.columns.tolist()

def test_read_csv_with_skiprows():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions() \
        .use_threads(True) \
        .block_size(1 << 30) \
        .skip_rows(1)
    table_path = '/tmp/duplicate_data_0.csv'
    tb1: Table = read_csv(ctx, table_path, csv_read_options)
    pdf = pd.read_csv(table_path, skiprows=1)
    print(tb1)
    print("-" * 80)
    print(pdf)
    assert tb1.to_pandas().values.tolist() == pdf.values.tolist()

def multi_process():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)
    # The original helper referenced table1_path/table2_path without defining
    # them; the per-rank paths below follow the pattern used by
    # test_multi_process above.
    rank = ctx.get_rank()
    table1_path = f'/tmp/user_device_tm_{rank + 1}.csv'
    table2_path = f'/tmp/user_usage_tm_{rank + 1}.csv'
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, table1_path, csv_read_options)
    tb2: Table = read_csv(ctx, table2_path, csv_read_options)
    print(tb1.column_names)
    print(tb2.column_names)
    configs = {'join_type': 'inner', 'algorithm': 'sort',
               'left_col': 0, 'right_col': 0}
    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=[0],
                                      right_on=[3])
    tb3.show()
    tb4: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=['use_id'],
                                      right_on=['use_id'])
    tb4.show()
    tb4: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      on=['use_id'])
    tb4.show()
    # tb5: Table = tb1.distributed_join(ctx, table=tb2,
    #                                   join_type=configs['join_type'],
    #                                   algorithm=configs['algorithm'],
    #                                   on=[0])
    # tb5.show()
    ctx.finalize()

def test_uno_data_load():
    # The original referenced an undefined `ctx`; a local context is created
    # here following the pattern used throughout this suite.
    ctx = CylonContext(config=None, distributed=False)
    file_path = "/home/vibhatha/sandbox/UNO/Benchmarks/Data/Pilot1/"
    file_name = "combined_single_response_agg"
    save_file = "/tmp/combined_single_response_agg_enum"
    path = os.path.join(file_path, file_name)
    csv_read_options = CSVReadOptions().use_threads(True) \
        .block_size(1 << 30).with_delimiter("\t")
    t1 = time.time()
    tb = read_csv(ctx, path, csv_read_options)
    t2 = time.time()
    print(t2 - t1)
    print(tb.shape)
    print(tb.to_arrow())
    print(tb.column_names)
    tb_drugs = tb['DRUG']
    tb_drug = tb.unique(columns=['DRUG'], keep='first')['DRUG']
    tb_drug_ar_tb = tb_drug.to_arrow().combine_chunks()
    tb_drug_list = tb_drug_ar_tb.column(0).chunk(0).tolist()
    tb_drugs_ar_tb = tb_drugs.to_arrow().combine_chunks()
    tb_drugs_list = tb_drugs_ar_tb.column(0).chunk(0).tolist()
    # Map each distinct drug name to an integer id, then encode the full column
    tb_drug_list_dict = {}
    for index, drug in enumerate(tb_drug_list):
        tb_drug_list_dict[drug] = index
    tb_drugs_enum_list = []
    for drug in tb_drugs_list:
        tb_drugs_enum_list.append(tb_drug_list_dict[drug])
    tb_enum_drug = Table.from_list(ctx, ['DRUG'], [tb_drugs_enum_list])
    print(tb_enum_drug.shape, tb_drugs.shape)
    tb = tb.drop(['DRUG'])
    tb['DRUG'] = tb_enum_drug
    print(tb.to_arrow())
    pdf = tb.to_pandas()
    pdf.to_csv(save_file, sep="\t")

def test_read_csv_with_na_values():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions() \
        .use_threads(True) \
        .block_size(1 << 30) \
        .na_values(['na', 'none'])
    table_path = 'data/input/null_data.csv'
    tb1: Table = read_csv(ctx, table_path, csv_read_options)
    pdf = pd.read_csv(table_path, na_values=['na', 'none'])
    print(tb1)
    print("-" * 80)
    print(pdf)
    tb1 = tb1.fillna(0)
    pdf = pdf.fillna(0)
    assert tb1.to_pandas().values.tolist() == pdf.values.tolist()

def test_single_process():
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table1_path = '/tmp/user_device_tm_1.csv'
    table2_path = '/tmp/user_usage_tm_1.csv'
    assert os.path.exists(table1_path) and os.path.exists(table2_path)
    tb1: Table = read_csv(ctx, table1_path, csv_read_options)
    tb2: Table = read_csv(ctx, table2_path, csv_read_options)
    configs = {'join_type': 'inner', 'algorithm': 'sort'}
    tb3: Table = tb1.join(table=tb2,
                          join_type=configs['join_type'],
                          algorithm=configs['algorithm'],
                          left_on=[0],
                          right_on=[3])
    print(tb3.row_count, tb3.column_count)
    tb4: Table = tb1.join(table=tb2,
                          join_type=configs['join_type'],
                          algorithm=configs['algorithm'],
                          left_on=['use_id'],
                          right_on=['use_id'])
    tb5: Table = tb1.join(table=tb2,
                          join_type=configs['join_type'],
                          algorithm=configs['algorithm'],
                          on=['use_id'])
    # tb6: Table = tb1.join(ctx, table=tb2,
    #                       join_type=configs['join_type'],
    #                       algorithm=configs['algorithm'],
    #                       on=[0])
    # tb6.show()
    assert tb3.row_count == tb4.row_count == tb5.row_count and \
           tb3.column_count == tb4.column_count == tb5.column_count
    ctx.finalize()

def test_drop():
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table1_path = '/tmp/user_usage_tm_1.csv'
    assert os.path.exists(table1_path)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb: Table = read_csv(ctx, table1_path, csv_read_options)
    drop_column = 'outgoing_sms_per_month'
    tb_new = tb.drop([drop_column])
    assert drop_column not in tb_new.column_names

def join_op(num_rows: int, num_cols: int, algorithm: str, unique_factor: float):
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    pdf_left = get_dataframe(num_rows=num_rows, num_cols=num_cols,
                             unique_factor=unique_factor, stringify=False)
    pdf_right = get_dataframe(num_rows=num_rows, num_cols=num_cols,
                              unique_factor=unique_factor, stringify=False)
    # NOTE: sort join breaks when the data is loaded in-memory via a pandas
    # dataframe, so the frames are round-tripped through CSV files instead.
    pdf_left.to_csv("/tmp/left_table.csv", index=False)
    pdf_right.to_csv("/tmp/right_table.csv", index=False)
    csv_read_options = CSVReadOptions() \
        .use_threads(True) \
        .block_size(1 << 30)
    tb_left = read_csv(ctx, "/tmp/left_table.csv", csv_read_options)
    tb_right = read_csv(ctx, "/tmp/right_table.csv", csv_read_options)
    join_col = tb_left.column_names[0]
    cylon_time = time.time()
    tb2 = tb_left.join(tb_right, join_type='inner', algorithm=algorithm,
                       on=[join_col])
    cylon_time = time.time() - cylon_time
    pandas_time = time.time()
    pdf2 = pdf_left.join(pdf_right, how="inner", on=join_col,
                         lsuffix="_l", rsuffix="_r")
    pandas_time = time.time() - pandas_time
    pandas_eval_time = time.time()
    pdf2 = pd.eval(
        "pdf_left.join(pdf_right, how='inner', on=join_col, lsuffix='_l', rsuffix='_r')"
    )
    pandas_eval_time = time.time() - pandas_eval_time
    ctx.finalize()
    return pandas_time, cylon_time, pandas_eval_time

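# Hypothetical invocation of the benchmark helper above; the argument values
# are illustrative, not taken from the original code:
#
#   pandas_t, cylon_t, pandas_eval_t = join_op(num_rows=1_000_000, num_cols=2,
#                                              algorithm='sort',
#                                              unique_factor=0.9)
#   print(f"pandas={pandas_t:.4f}s cylon={cylon_t:.4f}s eval={pandas_eval_t:.4f}s")
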
def test_series_tolist():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb1: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb1.to_pandas()
    series = pdf[pdf.columns[0]]
    print(type(series))
    lst = series.tolist()
    npy = series.to_numpy()
    print(lst)
    idx = series.index.values
    print(type(idx), idx)

def test_iterrows():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb1: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb1.to_pandas()
    tb1.set_index(tb1.column_names[0], drop=True)
    pdf.set_index(pdf.columns[0], drop=True, inplace=True)
    for p, c in zip(pdf.iterrows(), tb1.iterrows()):
        idx_p = p[0]
        row_p = p[1].tolist()
        idx_c = c[0]
        row_c = c[1]
        assert idx_p == idx_c
        assert row_p == row_c

def test_isin_with_getitem():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb.to_pandas()
    tb.set_index(tb.column_names[0], drop=True)
    pdf.set_index(pdf.columns[0], drop=True, inplace=True)
    assert tb.index.values.tolist() == pdf.index.values.tolist()
    compare_values = [4, 1, 10, 100, 150]
    tb_res_isin = tb.index.isin(compare_values)
    pdf_res_isin = pdf.index.isin(compare_values)
    assert tb_res_isin.tolist() == pdf_res_isin.tolist()
    print(tb_res_isin)
    print(pdf_res_isin)
    pdf1 = pdf[pdf_res_isin]
    print("Pandas Output")
    print(pdf1)
    print(pdf1.index.values)
    tb_filter = Table.from_list(ctx, ['filter'], [tb_res_isin.tolist()])
    tb1 = tb[tb_filter]
    resultant_index = tb.index.values[tb_res_isin].tolist()
    print(resultant_index)
    tb1.set_index(resultant_index)
    print("PyCylon Output")
    print(tb1)
    print(tb1.index.values)
    assert pdf1.values.tolist() == tb1.to_pandas().values.tolist()
    print(tb1.index.values)
    print(pdf1.index.values)
    assert tb1.index.values.tolist() == pdf1.index.values.tolist()

def test_filter():
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table1_path = '/tmp/user_usage_tm_1.csv'
    table2_path = '/tmp/user_usage_tm_2.csv'
    assert os.path.exists(table1_path) and os.path.exists(table2_path)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb: Table = read_csv(ctx, table1_path, csv_read_options)
    column_name = 'monthly_mb'
    ops = [operator.__or__, operator.__and__]
    or_limits = [600, 5000, 15000]
    and_limits = [0, 5000, 1000]
    comp_op_or = [operator.__gt__, operator.__le__, operator.__gt__]
    comp_op_and = [operator.__gt__, operator.__le__, operator.__gt__]
    limits = [or_limits, and_limits]
    comp_ops = [comp_op_or, comp_op_and]
    for op, limit, comp_op in zip(ops, limits, comp_ops):
        print("Op ", op)
        tb_cond_1 = comp_op[0](tb[column_name], limit[0])
        tb_cond_2 = comp_op[1](tb[column_name], limit[1])
        tb_cond_3 = comp_op[2](tb[column_name], limit[2])
        res_1_op = op(tb_cond_1, tb_cond_2)
        res_2_op = op(res_1_op, tb_cond_3)
        res_1 = tb[res_1_op]
        res_2 = tb[res_2_op]
        column_pdf_1 = res_1[column_name].to_pandas()
        column_pdf_2 = res_2[column_name].to_pandas()
        column_1 = column_pdf_1[column_name]
        for col in column_1:
            assert op(comp_op[0](col, limit[0]), comp_op[1](col, limit[1]))
        column_2 = column_pdf_2[column_name]
        for col in column_2:
            assert op(op(comp_op[0](col, limit[0]), comp_op[1](col, limit[1])),
                      comp_op[2](col, limit[2]))

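# The operator.__or__ / operator.__and__ calls above are just the `|` and `&`
# operators; the first OR case in the loop is equivalent to this sketch
# (column name and limits taken from the test itself):
#
#   mask = (tb['monthly_mb'] > 600) | (tb['monthly_mb'] <= 5000)
#   res_1 = tb[mask]
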
def test_isin():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb.to_pandas()
    tb.set_index(tb.column_names[0], drop=True)
    pdf.set_index(pdf.columns[0], drop=True, inplace=True)
    assert tb.index.values.tolist() == pdf.index.values.tolist()
    compare_values = [4, 1, 10, 100, 150]
    tb_res_isin = tb.index.isin(compare_values)
    pdf_res_isin = pdf.index.isin(compare_values)
    assert tb_res_isin.tolist() == pdf_res_isin.tolist()

def test_conversion_check():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)
    rank, size = ctx.get_rank(), ctx.get_world_size()
    assert size == 2
    table1_path = f'/tmp/user_usage_tm_{rank + 1}.csv'
    table2_path = f'/tmp/user_device_tm_{rank + 1}.csv'
    assert os.path.exists(table1_path)
    assert os.path.exists(table2_path)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, table1_path, csv_read_options)
    tb2: Table = read_csv(ctx, table2_path, csv_read_options)
    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type='inner',
                                      algorithm='sort',
                                      left_on=[3],
                                      right_on=[0])
    # pdf: pd.DataFrame = tb3.to_pandas()
    npy: np.ndarray = tb3.to_numpy(order='C')
    # Cylon table rows must equal the rows of the pandas dataframe extracted from the table
    # assert tb3.rows == pdf.shape[0]
    # Cylon table columns must equal the columns of the pandas dataframe extracted from the table
    # assert tb3.columns == pdf.shape[1]
    # Cylon table rows must equal the rows of the numpy ndarray extracted from the table
    assert tb3.row_count == npy.shape[0]
    # Cylon table columns must equal the columns of the numpy ndarray extracted from the table
    assert tb3.column_count == npy.shape[1]
    print(f"Rank[{ctx.get_rank()}]: Table.Rows={tb3.row_count}, "
          f"Table.Columns={tb3.column_count}, Numpy Array Shape = {npy.shape}")
    print(f"Array Config Rank[{ctx.get_rank()}], {npy.flags} {npy.dtype}")

def test_unique():
    ctx = cn.CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb1: cn.Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb1.to_pandas()
    expected_indices_of_sort_col = [1, 2, 3, 4, 5, 7, 10, 12, 13, 14, 15]
    tb2 = tb1.unique(columns=['a', 'b'], keep='first')
    pdf2 = pdf.drop_duplicates(subset=['a', 'b'])
    tb2.show()
    sort_col = tb2.sort(3).to_pydict()['d']
    assert sort_col == expected_indices_of_sort_col
    assert pdf2['d'].values.tolist() == sort_col
    ctx.finalize()

def test_distributed_run():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)
    table1_path = '/tmp/user_usage_tm_1.csv'
    table2_path = '/tmp/user_device_tm_1.csv'
    assert os.path.exists(table1_path)
    assert os.path.exists(table2_path)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, table1_path, csv_read_options)
    tb2: Table = read_csv(ctx, table2_path, csv_read_options)
    configs = {'join_type': 'inner', 'algorithm': 'sort'}
    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=[3],
                                      right_on=[0])
    row_count = tb3.row_count
    column_count = tb3.column_count
    assert ctx.get_world_size() == 4
    assert column_count == 8
    rank = ctx.get_rank()
    if rank == 0:
        assert row_count == 640
    elif rank == 1:
        assert row_count == 624
    elif rank == 2:
        assert row_count == 592
    elif rank == 3:
        assert row_count == 688
    else:
        raise Exception("Parallelism not supported in this test")

def test_getitem_with_index():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb.to_pandas()
    print(tb)
    print("-" * 80)
    print(pdf)
    tb.set_index('a', drop=True)
    pdf.set_index('a', drop=True, inplace=True)
    assert tb.index.values.tolist() == pdf.index.values.tolist()
    tb_1 = tb['b']
    pdf_1 = pdf['b']
    print(tb_1.index.values)
    print(pdf_1.index.values)
    assert tb_1.index.values.tolist() == pdf_1.index.values.tolist()
    tb_2 = tb[0:10]
    pdf_2 = pdf[0:10]
    print(tb_2.index.values)
    print(pdf_2.index.values)
    assert tb_2.index.values.tolist() == pdf_2.index.values.tolist()
    tb_3 = tb[['c', 'd']]
    pdf_3 = pdf[['c', 'd']]
    print(tb_3.index.values)
    print(pdf_3.index.values)
    assert tb_3.index.values.tolist() == pdf_3.index.values.tolist()

def test_rl():
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table1_path = '/tmp/user_usage_tm_1.csv'
    table2_path = '/tmp/user_usage_tm_2.csv'
    assert os.path.exists(table1_path) and os.path.exists(table2_path)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, table1_path, csv_read_options)
    tb2: Table = read_csv(ctx, table2_path, csv_read_options)
    print("First Hello World From Rank {}, Size {}".format(ctx.get_rank(),
                                                           ctx.get_world_size()))
    tb3: Table = tb1.join(table=tb2,
                          join_type='inner',
                          algorithm='hash',
                          left_on=[0],
                          right_on=[0])
    assert tb3.row_count == 458 and tb3.column_count == 8
    tb4: Table = tb1.union(tb2)
    assert tb4.row_count == 240 and tb4.column_count == 4
    tb5: Table = tb1.subtract(tb2)
    assert tb5.row_count == 0 and tb5.column_count == 4
    tb6: Table = tb1.intersect(tb2)
    assert tb6.row_count == 240 and tb6.column_count == 4
    ctx.finalize()

def test_arrow_cylon():
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/user_device_tm_1.csv'
    assert os.path.exists(table_path)
    tb: pa.Table = csv.read_csv(table_path)
    arrow_columns = len(tb.columns)
    arrow_rows = tb.num_rows
    tbc = Table.from_arrow(ctx, tb)
    cylon_rows = tbc.row_count
    cylon_columns = tbc.column_count
    assert arrow_columns == cylon_columns
    assert arrow_rows == cylon_rows
    ctx.finalize()

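# Round-trip sketch (an assumption, not asserted by the original test):
# converting back with to_arrow(), as used elsewhere in this suite, should
# preserve the column names of the source Arrow table.
#
#   assert tbc.to_arrow().column_names == tb.column_names
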
def test_setitem_with_index():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb.to_pandas()
    print(tb)
    print("-" * 80)
    print(pdf)
    tb.set_index('a', drop=True)
    pdf.set_index('a', drop=True, inplace=True)
    new_data = [i * 10 for i in range(tb.row_count)]
    new_tb = Table.from_list(ctx, ['new_col'], [new_data])
    tb['e'] = new_tb
    pdf['e'] = pd.DataFrame(new_data)
    print(tb.index.values)
    print(pdf.index.values)
    assert tb.index.values.tolist() == pdf.index.values.tolist()

def test_table_initialization_with_index():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb: Table = read_csv(ctx, table_path, csv_read_options)
    expected_index = [i for i in range(tb.row_count)]
    expected_index_1 = [0, 1, 2]
    print(tb)
    print(tb.index.values)
    assert expected_index == tb.index.values.tolist()
    pd_data = [[1, 2, 3], [4, 5, 6], [6, 7, 8]]
    cols = ['a', 'b', 'c']
    dict_data = {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [6, 7, 8]}
    pdf = pd.DataFrame(pd_data, columns=cols)
    print(pdf)
    tb_from_pd = Table.from_pandas(ctx, pdf)
    print(tb_from_pd)
    assert tb_from_pd.index.values.tolist() == pdf.index.values.tolist()
    tb_from_list = Table.from_list(ctx, cols, pd_data)
    print(tb_from_list)
    print(tb_from_list.index.values)
    assert expected_index_1 == tb_from_list.index.values.tolist()
    tb_from_dict = Table.from_pydict(ctx, dict_data)
    print(tb_from_dict)
    print(tb_from_dict.index.values)
    assert expected_index_1 == tb_from_dict.index.values.tolist()