コード例 #1
0
def shuffle():
    mpi_config = MPIConfig()

    ctx = CylonContext(config=mpi_config, distributed=True)
    rows = 5
    tb: Table = Table.from_pydict(ctx, {'c1': [i for i in range(rows)], 'c2': [i * 2 for i in range(
        rows)], 'c3': [i * 3 for i in range(rows)]})

    tb: Table = Table.from_numpy(ctx, ['c1', 'c2', 'c3'], [np.random.random(size=rows),
                                                           np.random.random(size=rows),
                                                           np.random.random(size=rows)])

    print(tb.shape)

    tb_shuffle = tb.shuffle(['c1'])

    tb_shuffle_dna = tb_shuffle.dropna(axis=1, how='all')

    print("Rank : ", ctx.get_rank(), tb_shuffle.shape, tb.shape, tb_shuffle_dna.shape)

    from pycylon.io import CSVWriteOptions

    csv_write_options = CSVWriteOptions().with_delimiter(',')
    #
    # tb_shuffle.to_csv(f'/tmp/shuffle_{rows}_{ctx.get_rank()}.csv', csv_write_options)

    ctx.finalize()
コード例 #2
0
def test_unique():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb1: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb1.to_pandas()

    expected_indices_of_sort_col = [1, 2, 3, 4, 5, 7, 10, 12, 13, 14, 15]

    print("Original Data")
    print(pdf)

    tb2 = tb1['b'].unique()
    pdf2 = pdf['b'].unique()
    tb2.show()

    print("Unique Pdf")
    print(pdf2)
    print(type(pdf2))

    print("Unique Cylon")
    print(tb2)

    tb3_list = list(tb2.to_pydict().items())[0][1]
    pdf3_list = pdf2.tolist()

    assert tb3_list == pdf3_list

    set_pdf4 = set(pdf2)
    set_tb4 = set(tb3_list)

    assert set_tb4 == set_pdf4

    ctx.finalize()