def shuffle():
    """Build a small three-column table, shuffle it on ``'c1'`` and print shapes.

    Creates a distributed ``CylonContext`` from an ``MPIConfig``, builds a
    table with columns ``c1``, ``c2`` and ``c3``, calls ``Table.shuffle`` on
    column ``'c1'`` and then ``dropna(axis=1, how='all')`` on the shuffled
    table. The shape of the input table is printed first, followed by one
    line with the rank and the shapes of the shuffled, input and
    ``dropna`` tables. The context is finalized before returning.
    """
    ctx = CylonContext(config=MPIConfig(), distributed=True)

    n_rows = 5
    column_names = ['c1', 'c2', 'c3']

    # NOTE: this deterministic table is replaced right below; only the
    # random table is used by the rest of the function.
    table: Table = Table.from_pydict(
        ctx,
        {
            'c1': list(range(n_rows)),
            'c2': [2 * i for i in range(n_rows)],
            'c3': [3 * i for i in range(n_rows)],
        },
    )

    # One random column per name, generated in column order.
    random_columns = [np.random.random(size=n_rows) for _ in column_names]
    table: Table = Table.from_numpy(ctx, column_names, random_columns)
    print(table.shape)

    shuffled = table.shuffle(['c1'])
    shuffled_without_na = shuffled.dropna(axis=1, how='all')

    print(
        "Rank : ",
        ctx.get_rank(),
        shuffled.shape,
        table.shape,
        shuffled_without_na.shape,
    )

    from pycylon.io import CSVWriteOptions

    csv_write_options = CSVWriteOptions().with_delimiter(',')
    # CSV export is currently disabled; uncomment to write one file per rank:
    # shuffled.to_csv(f'/tmp/shuffle_{n_rows}_{ctx.get_rank()}.csv', csv_write_options)

    ctx.finalize()
def test_unique():
    """Check that ``unique`` on column ``'b'`` agrees between Cylon and pandas.

    Reads ``/tmp/duplicate_data_0.csv`` into a Cylon table using a
    non-distributed context, converts it to a pandas ``DataFrame``, and
    computes the unique values of column ``'b'`` with both libraries. The
    results are compared first as ordered lists and then as sets.

    The context is finalized even when reading the file or an assertion
    fails, so a failing test does not leave the context open.

    Raises:
        AssertionError: If the Cylon and pandas unique values differ.
    """
    ctx = CylonContext(config=None, distributed=False)
    try:
        csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
        table_path = '/tmp/duplicate_data_0.csv'
        tb1: Table = read_csv(ctx, table_path, csv_read_options)
        pdf: pd.DataFrame = tb1.to_pandas()

        print("Original Data")
        print(pdf)

        tb2 = tb1['b'].unique()
        pdf2 = pdf['b'].unique()
        tb2.show()

        print("Unique Pdf")
        print(pdf2)
        print(type(pdf2))

        print("Unique Cylon")
        print(tb2)

        # The unique table is expected to hold a single column; take the
        # values of the first (only) column from its dict representation.
        cylon_unique = next(iter(tb2.to_pydict().values()))
        pandas_unique = pdf2.tolist()

        # Order-sensitive comparison first, then an order-insensitive one.
        assert cylon_unique == pandas_unique, (
            "Cylon and pandas unique values differ (order-sensitive)"
        )
        assert set(cylon_unique) == set(pdf2), (
            "Cylon and pandas unique values differ (order-insensitive)"
        )
    finally:
        ctx.finalize()