コード例 #1
0
def test_table():
    """Exercise the basic ``Table`` API on a local CSV file.

    The test builds tables from a PyArrow table and from ``read_csv``,
    converts back to Arrow, writes a CSV, sorts (once with an integer
    argument, once with a column name), merges a table with itself and
    projects columns by position and by name. The input file at
    ``/tmp/user_device_tm_1.csv`` is expected to contain 272 rows and
    4 columns.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)

    # Run everything under try/finally so the context is finalized even when
    # one of the assertions below fails.
    try:
        table_path = '/tmp/user_device_tm_1.csv'

        pyarrow_table = pyarrow_read_csv(table_path)

        # Construction directly from a PyArrow table.
        tb = Table(pyarrow_table, ctx)

        assert isinstance(tb, Table)

        # Round trip back to Arrow.
        ar_tb2 = tb.to_arrow()

        assert isinstance(ar_tb2, pa.Table)

        tb2 = Table.from_arrow(ctx, pyarrow_table)

        assert tb2.row_count == 272 and tb2.column_count == 4

        # CSV reader path.
        csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

        tb3 = read_csv(ctx, table_path, csv_read_options)

        assert tb3.row_count == 272 and tb3.column_count == 4

        # CSV writer path.
        csv_write_options = CSVWriteOptions().with_delimiter(',')

        tb3.to_csv('/tmp/temp_record.csv', csv_write_options)

        # Sorting must keep the shape and the column order intact.
        tb4 = tb3.sort(1)

        col_names = ['use_id', 'user_id', 'platform_version', 'use_type_id']

        for idx, col in enumerate(col_names):
            assert tb4.column_names[idx] == col

        assert tb4.row_count == 272 and tb4.column_count == 4

        tb5 = tb3.sort('use_type_id')

        assert tb5.row_count == 272 and tb5.column_count == 4

        for idx, col in enumerate(col_names):
            assert tb5.column_names[idx] == col

        # Merging a table with itself doubles the row count.
        # NOTE(review): this file calls Table.merge both with and without a
        # leading ctx argument -- confirm which signature the installed
        # pycylon version expects.
        tb6 = Table.merge([tb4, tb4])

        assert tb6.row_count == 544 and tb6.column_count == 4

        # tb7 is a plain alias of tb6 (no copy is made), so this re-checks
        # the same object.
        tb7 = tb6

        assert tb7.row_count == 544 and tb7.column_count == 4

        # Projection by column position.
        tb8 = tb3.project([0, 1])

        assert tb8.row_count == 272 and tb8.column_count == 2

        # Projection by column name.
        tb9 = tb3.project(['use_id', 'platform_version'])

        assert tb9.row_count == 272 and tb9.column_count == 2

        project_col_names = ['use_id', 'platform_version']

        for idx, col in enumerate(project_col_names):
            assert tb9.column_names[idx] == col
    finally:
        ctx.finalize()
コード例 #2
0
import time

import numpy as np
import pandas as pd
from pycylon import Table
from pycylon import CylonContext

# Micro-benchmark: time pandas' outer ``DataFrame.merge`` against Cylon's
# ``Table.merge`` on two randomly generated frames that share one schema
# (100 float columns named data0..data99 plus an integer 'key' column).
# NOTE(review): whether the two merge calls are semantically equivalent is
# not established here -- confirm before comparing the timings.

ctx: CylonContext = CylonContext(config=None, distributed=False)
num_rows = 10  # e.g. 10_000_000 for a larger run
data = np.random.randn(num_rows)

# Every data column of the first frame holds the same random vector.
df = pd.DataFrame({f'data{i}': data for i in range(100)})
df['key'] = np.random.randint(0, 100, size=num_rows)
ct = Table.from_pandas(ctx, df)
ct.set_index(range(0, num_rows))

# The second frame uses its own random vector; previously data1 was generated
# but never used, so both frames carried identical data columns.
data1 = np.random.randn(num_rows)
df1 = pd.DataFrame({f'data{i}': data1 for i in range(100)})
df1['key'] = np.random.randint(0, 100, size=num_rows)
ct1 = Table.from_pandas(ctx, df1)
ct1.set_index(range(0, num_rows))

# perf_counter() is monotonic and high resolution, which makes it the right
# clock for measuring short intervals (time.time() can jump with the wall
# clock).
t1 = time.perf_counter()
new_df = df.merge(df1, how='outer')
t2 = time.perf_counter()

t3 = time.perf_counter()
# NOTE(review): this file calls Table.merge both with and without a leading
# ctx argument -- confirm which signature the installed pycylon expects.
new_ct = Table.merge(ctx, [ct, ct1])
t4 = time.perf_counter()

print('pandas :', t2 - t1, 'cylon :', t4 - t3)

# Release the context once the benchmark is done.
ctx.finalize()