Example #1
def test_multicol():
    # cylon
    ctx: CylonContext = CylonContext()

    c1 = np.random.randint(10, size=100)
    c2 = np.random.randint(10, size=100)

    ar1 = pa.array(c1)
    ar2 = pa.array(c2)

    pa_t: pa.Table = pa.Table.from_arrays([ar1, ar2], names=['col1', 'col2'])

    cn_t = cn.Table.from_arrow(ctx, pa_t)

    cn_srt = cn_t.sort(order_by=['col1', 'col2'], ascending=[True, False])

    # pandas

    df = pd.DataFrame({'col1': c1, 'col2': c2}, columns=['col1', 'col2'])

    df = df.sort_values(by=['col1', 'col2'], ascending=[True, False])

    assert cn_srt.to_pandas().values.tolist() == df.values.tolist()
Example #2
def test_arrow_cylon():
    ctx: CylonContext = CylonContext(config=None, distributed=False)

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    table_path = '/tmp/user_device_tm_1.csv'

    assert os.path.exists(table_path)

    tb: pa.Table = csv.read_csv(table_path)

    arrow_columns = len(tb.columns)
    arrow_rows = tb.num_rows

    tbc = Table.from_arrow(ctx, tb)

    cylon_rows = tbc.row_count
    cylon_columns = tbc.column_count

    assert arrow_columns == cylon_columns
    assert arrow_rows == cylon_rows

    ctx.finalize()
Example #3
def test_setitem_with_index():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
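    # block_size(1 << 30) sets a 1 GiB read block; use_threads(True) enables
    # multithreaded CSV parsing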
    table_path = '/tmp/duplicate_data_0.csv'
    tb: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb.to_pandas()

    print(tb)
    print("-" * 80)
    print(pdf)

    tb.set_index('a', drop=True)
    pdf.set_index('a', drop=True, inplace=True)

    new_data = [i * 10 for i in range(tb.row_count)]
    new_tb = Table.from_list(ctx, ['new_col'], [new_data])
    tb['e'] = new_tb
    pdf['e'] = pd.DataFrame(new_data)

    print(tb.index.values)
    print(pdf.index.values)

    assert tb.index.values.tolist() == pdf.index.values.tolist()
Example #4
def test_dropna():
    import numpy as np
    columns = ['col1', 'col2', 'col3']
    dtype = 'int32'
    datum_1 = [[1.0, 2.0, 3.0, 4.0, 5.0, None],
               [None, 7.0, 8.0, 9.0, 10.0, 11.0],
               [12.0, 13.0, 14.0, 15.0, 16.0, 17.0]]
    datum_2 = [[1.0, 2.0, 3.0, 4.0, 5.0, None],
               [None, 7.0, 8.0, 9.0, 10.0, None],
               [12.0, 13.0, None, 15.0, 16.0, 17.0]]

    dataset = [datum_1, datum_2]
    ctx: CylonContext = CylonContext(config=None, distributed=False)

    # In Cylon, axis=0 means column-wise dropna (the opposite of pandas'
    # convention), hence the `1 - axis` mapping in the pandas calls below.
    inplace_ops = [True, False]
    hows = ['any', 'all']
    axiz = [0, 1]
    for inplace in inplace_ops:
        for how in hows:
            for axis in axiz:
                for data in dataset:
                    cn_tb = cn.Table.from_list(ctx, columns, data)
                    df = cn_tb.to_pandas()
                    if inplace:
                        cn_tb.dropna(axis=axis, how=how, inplace=inplace)
                        df.dropna(axis=1 - axis, how=how, inplace=inplace)
                    else:
                        cn_tb = cn_tb.dropna(axis=axis,
                                             how=how,
                                             inplace=inplace)
                        df = df.dropna(axis=1 - axis, how=how, inplace=inplace)

                    pdf_values = df.fillna(0).values.flatten().tolist()
                    cn_tb_values = cn_tb.to_pandas().fillna(
                        0).values.flatten().tolist()
                    assert pdf_values == cn_tb_values
Example #5
def isin_op(num_rows: int, num_cols: int, filter_size: int, unique_factor: float):
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    ctx.add_config("compute_engine", "arrow")
    df = get_dataframe(num_rows=num_rows, num_cols=num_cols, unique_factor=unique_factor)

    ct = Table.from_pandas(ctx, df)

    cmp_data = np.random.randn(filter_size)

    cmp_data = cmp_data.tolist()

    pandas_time = time.time()
    df.isin(cmp_data)
    pandas_time = time.time() - pandas_time

    cylon_time = time.time()
    ct.isin(cmp_data)
    cylon_time = time.time() - cylon_time

    pandas_eval_time = time.time()
    pd.eval('df.isin(cmp_data)')
    pandas_eval_time = time.time() - pandas_eval_time

    return pandas_time, cylon_time, pandas_eval_time
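# A hypothetical driver (not part of the original listing) showing how the
# benchmark above might be invoked; get_dataframe and the imports used by
# isin_op are assumed to be in scope:
if __name__ == '__main__':
    for n in (1_000, 10_000, 100_000):
        p_t, c_t, pe_t = isin_op(num_rows=n, num_cols=4,
                                 filter_size=100, unique_factor=0.5)
        print(f"rows={n}: pandas={p_t:.4f}s, cylon={c_t:.4f}s, eval={pe_t:.4f}s")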
Example #6
def test_df_perf_iterrows():
    ctx = CylonContext(config=None, distributed=False)

    dataset = []
    num_rows = 100_000
    num_columns = 2

    data = np.random.randn(num_rows)

    pdf = pd.DataFrame({'data{}'.format(i): data for i in range(num_columns)})

    tb1 = Table.from_pandas(ctx, pdf)

    tb1.set_index(tb1.column_names[0], drop=True)
    pdf.set_index(pdf.columns[0], drop=True, inplace=True)

    print(pdf)
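    # Compare pandas iterrows against a plain python-dict row scan; the no-op
    # assignments in the loops below just consume each value for timing.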
    t1 = time.time()
    for idx, row in pdf.iterrows():
        idx = idx
        row = row
    t2 = time.time()
    pydict = tb1.to_pydict(with_index=True)
    indices = tb1.index.index_values
    rows = []
    for index in indices:
        row = []
        for col in pydict:
            row.append(pydict[col][index])
        rows.append(row)

    for index, row in zip(indices, rows):
        index = index
        row = row
    t3 = time.time()
    print(t2 - t1, t3 - t2)
Example #7
def test_isin():
    dict_elems = {'num_legs': [2, 4], 'num_wings': [2, 0]}

    indices = ['falcon', 'dog']
    indices_cmp = ['spider', 'falcon']
    df = pd.DataFrame(dict_elems, index=indices)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb = cn.Table.from_pydict(ctx, dict_elems)
    cn_tb.set_index(indices, IndexingType.LINEAR)

    ########

    list_comp_values = [2, 0]
    dict_comp_values = {'num_legs': [2, 0]}
    dict_comp_elements = {'num_legs': [8, 2], 'num_wings': [0, 2]}
    cn_tb_other = cn.Table.from_pydict(ctx, dict_comp_elements)
    cn_tb_other.set_index(indices_cmp, IndexingType.LINEAR)
    other = pd.DataFrame(dict_comp_elements, index=indices_cmp)

    comp_values = [list_comp_values, dict_comp_values]

    for comp_val in comp_values:
        assert df.isin(comp_val).values.tolist() == cn_tb.isin(
            comp_val).to_pandas().values.tolist()
Example #8
def test_table_initialization_with_index():
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb: Table = read_csv(ctx, table_path, csv_read_options)
    expected_index = [i for i in range(tb.row_count)]
    expected_index_1 = [0, 1, 2]

    print(tb)
    print(tb.index.values)

    assert expected_index == tb.index.values.tolist()

    pd_data = [[1, 2, 3], [4, 5, 6], [6, 7, 8]]
    cols = ['a', 'b', 'c']
    dict_data = {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [6, 7, 8]}
    pdf = pd.DataFrame(pd_data, columns=cols)
    print(pdf)

    tb_from_pd = Table.from_pandas(ctx, pdf)
    print(tb_from_pd)

    assert tb_from_pd.index.values.tolist() == pdf.index.values.tolist()

    tb_from_list = Table.from_list(ctx, cols, pd_data)

    print(tb_from_list)
    print(tb_from_list.index.values)

    assert expected_index_1 == tb_from_list.index.values.tolist()

    tb_from_dict = Table.from_pydict(ctx, dict_data)
    print(tb_from_dict)
    print(tb_from_dict.index.values)

    assert expected_index_1 == tb_from_dict.index.values.tolist()
Example #9
    # Tail of demo_basic(rank, world_size) from a PyCylon + PyTorch DDP demo;
    # the function header and data setup are not part of this listing.
    # Create the model and move it to the GPU with id `rank`.

    model = Network().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    if rank == 0:
        print("Training A Dummy Model")
    for t in range(20):
        for x_batch, y_batch in zip(x_train, y_train):
            print(f"Epoch {t}", end='\r')
            prediction = ddp_model(x_batch)
            loss = loss_fn(prediction, y_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    cleanup()


if __name__ == '__main__':
    ctx: CylonContext = CylonContext('mpi')
    rank = ctx.get_rank()
    world_size = ctx.get_world_size()
    demo_basic(rank=rank, world_size=world_size)
    ctx.finalize()
Example #10
from pycylon import Table
from pycylon.csv import csv_reader
from pycylon import CylonContext

ctx: CylonContext = CylonContext(config=None)

tb: Table = csv_reader.read(ctx, '/tmp/user_usage_tm_1.csv', ',')

print("Table Column Names")
print(tb.column_names)

print("Table Schema")
print(tb.schema)

print(tb[0].to_pandas())

print(tb[0:5].to_pandas())

print(tb[2:5].to_pandas())

print(tb[5].to_pandas())

print(tb[7].to_pandas())

tb.show_by_range(0, 4, 0, 4)

print(tb[0:5].to_pandas())

ctx.finalize()

Example #11
from pycylon.csv import csv_reader
from pycylon import Table
from pycylon import CylonContext
import argparse

ctx: CylonContext = CylonContext("mpi")

parser = argparse.ArgumentParser(description='PyCylon Table Conversion')
parser.add_argument('--table1_path', type=str, help='Path to table 1 csv')
parser.add_argument('--table2_path', type=str, help='Path to table 2 csv')

args = parser.parse_args()

tb1: Table = csv_reader.read(ctx, args.table1_path, ',')

tb2: Table = csv_reader.read(ctx, args.table2_path, ',')

configs = {
    'join_type': 'left',
    'algorithm': 'hash',
    'left_col': 0,
    'right_col': 0
}

tb3: Table = tb1.distributed_join(ctx,
                                  table=tb2,
                                  join_type=configs['join_type'],
                                  algorithm=configs['algorithm'],
                                  left_col=configs['left_col'],
                                  right_col=configs['right_col'])
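# Hypothetical launch command (assuming a working MPI installation); run from
# a shell with your own script name and CSV paths:
#   mpirun -n 2 python dist_join.py --table1_path /tmp/t1.csv --table2_path /tmp/t2.csv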
Example #12
def test_prefix_process():
    ctx: CylonContext = CylonContext(config=None, distributed=False)

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    table1_path = '/tmp/user_device_tm_1.csv'
    table2_path = '/tmp/user_usage_tm_1.csv'

    assert os.path.exists(table1_path) and os.path.exists(table2_path)

    tb1: Table = read_csv(ctx, table1_path, csv_read_options)

    tb2: Table = read_csv(ctx, table2_path, csv_read_options)

    configs = {'join_type': 'inner', 'algorithm': 'sort'}

    tb3: Table = tb1.join(
        table=tb2,
        join_type=configs['join_type'],
        algorithm=configs['algorithm'],
        left_on=[0],
        right_on=[3],
        left_prefix="l_",
        right_prefix="r_",
    )

    print(tb3.row_count, tb3.column_count)

    tb4: Table = tb1.join(
        table=tb2,
        join_type=configs['join_type'],
        algorithm=configs['algorithm'],
        left_on=['use_id'],
        right_on=['use_id'],
        left_prefix="l_1_",
        right_prefix="r_1_",
    )

    tb5: Table = tb1.join(
        table=tb2,
        join_type=configs['join_type'],
        algorithm=configs['algorithm'],
        on=['use_id'],
        left_prefix="l_2_",
        right_prefix="r_2_",
    )

    assert tb3.row_count == tb4.row_count == tb5.row_count and tb3.column_count == \
           tb4.column_count == tb5.column_count
    expected_column_names_1 = [
        'l_use_id', 'l_user_id', 'l_platform_version', 'l_use_type_id',
        'r_outgoing_mins_per_month', 'r_outgoing_sms_per_month',
        'r_monthly_mb', 'r_use_id'
    ]

    expected_column_names_2 = [
        'l_1_use_id', 'l_1_user_id', 'l_1_platform_version', 'l_1_use_type_id',
        'r_1_outgoing_mins_per_month', 'r_1_outgoing_sms_per_month',
        'r_1_monthly_mb', 'r_1_use_id'
    ]

    expected_column_names_3 = [
        'l_2_use_id', 'l_2_user_id', 'l_2_platform_version', 'l_2_use_type_id',
        'r_2_outgoing_mins_per_month', 'r_2_outgoing_sms_per_month',
        'r_2_monthly_mb', 'r_2_use_id'
    ]

    assert expected_column_names_1 == tb3.column_names
    assert expected_column_names_2 == tb4.column_names
    assert expected_column_names_3 == tb5.column_names

    ctx.finalize()
Example #13
def test_table():
    ctx: CylonContext = CylonContext(config=None, distributed=False)

    table_path = '/tmp/user_device_tm_1.csv'

    pyarrow_table = pyarrow_read_csv(table_path)

    tb = Table(pyarrow_table, ctx)

    assert isinstance(tb, Table)

    ar_tb2 = tb.to_arrow()

    assert isinstance(ar_tb2, pa.Table)

    tb2 = Table.from_arrow(ctx, pyarrow_table)

    assert tb2.row_count == 272 and tb2.column_count == 4

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb3 = read_csv(ctx, table_path, csv_read_options)

    assert tb3.row_count == 272 and tb3.column_count == 4

    csv_write_options = CSVWriteOptions().with_delimiter(',')

    tb3.to_csv('/tmp/temp_record.csv', csv_write_options)

    tb4 = tb3.sort(1)

    col_names = ['use_id', 'user_id', 'platform_version', 'use_type_id']

    for idx, col in enumerate(col_names):
        assert tb4.column_names[idx] == col

    assert tb4.row_count == 272 and tb4.column_count == 4

    tb5 = tb3.sort('use_type_id')

    assert tb5.row_count == 272 and tb5.column_count == 4

    for idx, col in enumerate(col_names):
        assert tb5.column_names[idx] == col

    tb6 = Table.merge([tb4, tb4])

    assert tb6.row_count == 544 and tb6.column_count == 4

    tb7 = tb6

    assert tb7.row_count == 544 and tb7.column_count == 4

    tb8 = tb3.project([0, 1])

    assert tb8.row_count == 272 and tb8.column_count == 2

    tb9 = tb3.project(['use_id', 'platform_version'])

    assert tb9.row_count == 272 and tb9.column_count == 2

    project_col_names = ['use_id', 'platform_version']

    for idx, col in enumerate(project_col_names):
        assert tb9.column_names[idx] == col

    ctx.finalize()
Example #14
"""
Install: PyCylon (Follow: https://cylondata.org/docs/)
Run Program: python demo_pytorch.py
"""

import os

import numpy as np
import pandas as pd
from pycylon import CylonContext
from pycylon import Table
from pycylon.csv import csv_reader

ctx: CylonContext = CylonContext(config='mpi')

base_path = "/tmp"

rank = ctx.get_rank()

user_devices_file = os.path.join(base_path, f'user_device_tm_{rank+1}.csv')
user_usage_file = os.path.join(base_path, f'user_usage_tm_{rank+1}.csv')

user_devices_data: Table = csv_reader.read(ctx, user_devices_file, ',')
user_usage_data: Table = csv_reader.read(ctx, user_usage_file, ',')

user_devices_df: pd.DataFrame = user_devices_data.to_pandas()
user_usage_df: pd.DataFrame = user_usage_data.to_pandas()

print(user_devices_df)  # stand-in: the original print call is truncated in this listing
Example #15
from pycylon.csv import csv_reader
from pycylon import Table
from pycylon import CylonContext
import argparse

ctx: CylonContext = CylonContext(config="mpi")

parser = argparse.ArgumentParser(description='PyCylon Table Conversion')
parser.add_argument('--table1_path', type=str, help='Path to table 1 csv')
parser.add_argument('--table2_path', type=str, help='Path to table 2 csv')

args = parser.parse_args()

tb1: Table = csv_reader.read(ctx, args.table1_path, ',')

tb2: Table = csv_reader.read(ctx, args.table2_path, ',')

configs = {
    'join_type': 'inner',
    'algorithm': 'sort',
    'left_col': 0,
    'right_col': 0
}

tb3: Table = tb1.distributed_join(ctx,
                                  table=tb2,
                                  join_type=configs['join_type'],
                                  algorithm=configs['algorithm'],
                                  left_col=configs['left_col'],
                                  right_col=configs['right_col'])
Example #16
def test_arrow_index():
    from pycylon.indexing.index import IndexingType
    from pycylon.indexing.index import ArrowLocIndexer

    pdf_float = pd.DataFrame({'a': pd.Series([1, 4, 7, 10, 20, 23, 11]),
                              'b': pd.Series([2, 5, 8, 11, 22, 25, 12], dtype='int'),
                              'c': pd.Series([12, 15, 18, 111, 122, 125, 112], dtype='int'),
                              'd': pd.Series([212, 215, 218, 211, 222, 225, 312], dtype='int'),
                              'e': pd.Series([1121, 12151, 12181, 12111, 12221, 12251, 13121],
                                             dtype='int')})
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf_float)
    indexing_type = IndexingType.LINEAR
    drop_index = True

    print("Before Indexing")
    print(cn_tb)

    cn_tb.set_index('a', indexing_type, drop_index)

    pdf_float = pdf_float.set_index('a')

    print("After Indexing")
    assert cn_tb.column_names == ['b', 'c', 'd', 'e']

    assert cn_tb.get_index().get_type() == IndexingType.LINEAR

    print(cn_tb.get_index().values)

    index_array = cn_tb.get_index().get_index_array()

    print(index_array)

    print(index_array.type)

    scalar_value = pa.scalar(10, index_array.type)

    print(scalar_value)

    arrow_loc_indexer = ArrowLocIndexer(IndexingType.LINEAR)
    output1 = arrow_loc_indexer.loc_with_index_range(4, 20, 0, cn_tb)

    print(output1)

    print(output1.get_index().values)

    output2 = arrow_loc_indexer.loc_with_index_range(4, 20, slice(0, 1), cn_tb)

    print(output2)

    print(output2.get_index().values)

    output3 = arrow_loc_indexer.loc_with_index_range(4, 20, [0, 1, 2], cn_tb)

    print(output3)

    print(output3.get_index().values)

    output4 = arrow_loc_indexer.loc_with_indices([4], 0, cn_tb)

    print(output4)

    print(output4.get_index().values)

    output5 = arrow_loc_indexer.loc_with_indices([4, 20], slice(0, 1), cn_tb)

    print(output5)

    print(output5.get_index().values)

    output6 = arrow_loc_indexer.loc_with_indices([4, 20], [0, 1, 2], cn_tb)

    print(output6)

    print(output6.get_index().values)
Example #17
def test_iloc_op_mode_1():
    from pycylon.indexing.index import IndexingType
    from pycylon.indexing.index_utils import IndexUtil

    pdf_float = pd.DataFrame({'a': pd.Series(["1", "4", "7", "10", "20", "23", "11"]),
                              'b': pd.Series([2, 5, 8, 11, 22, 25, 12], dtype='int'),
                              'c': pd.Series([12, 15, 18, 111, 122, 125, 112], dtype='int'),
                              'd': pd.Series([212, 215, 218, 211, 222, 225, 312], dtype='int'),
                              'e': pd.Series([1121, 12151, 12181, 12111, 12221, 12251, 13121],
                                             dtype='int')})
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf_float)
    indexing_type = IndexingType.LINEAR
    drop_index = True

    print("Before Indexing")
    print(cn_tb)

    cn_tb.set_index('a', indexing_type, drop_index)

    pdf_float = pdf_float.set_index('a')

    print("After Indexing")
    assert cn_tb.column_names == ['b', 'c', 'd', 'e']

    assert cn_tb.get_index().get_type() == IndexingType.LINEAR

    iloc_cn_1 = cn_tb.iloc[3:5, 1:3]
    iloc_pd_1 = pdf_float.iloc[3:5, 1:3]

    print(iloc_cn_1)
    print(iloc_pd_1)

    assert iloc_pd_1.values.tolist() == iloc_cn_1.to_pandas().values.tolist()

    iloc_cn_2 = cn_tb.iloc[3:5, 1:]
    iloc_pd_2 = pdf_float.iloc[3:5, 1:]

    print(iloc_cn_2)
    print(iloc_pd_2)

    assert iloc_pd_2.values.tolist() == iloc_cn_2.to_pandas().values.tolist()

    iloc_cn_3 = cn_tb.iloc[3:, 1:]
    iloc_pd_3 = pdf_float.iloc[3:, 1:]

    assert iloc_pd_3.values.tolist() == iloc_cn_3.to_pandas().values.tolist()

    iloc_cn_4 = cn_tb.iloc[:3, 1:]
    iloc_pd_4 = pdf_float.iloc[:3, 1:]

    print(iloc_cn_4)
    print(iloc_pd_4)

    assert iloc_pd_4.values.tolist() == iloc_cn_4.to_pandas().values.tolist()

    iloc_cn_5 = cn_tb.iloc[:, :]
    iloc_pd_5 = pdf_float.iloc[:, :]

    assert iloc_pd_5.values.tolist() == iloc_cn_5.to_pandas().values.tolist()

    iloc_cn_6 = cn_tb.iloc[[0, 2, 3], :]
    iloc_pd_6 = pdf_float.iloc[[0, 2, 3], :]

    assert iloc_pd_6.values.tolist() == iloc_cn_6.to_pandas().values.tolist()
Example #18
def test_loc_op_mode_2():
    from pycylon.indexing.index import IndexingType

    pdf_float = pd.DataFrame({'a': pd.Series(["1", "4", "7", "10", "20", "23", "11"]),
                              'b': pd.Series([2, 5, 8, 11, 22, 25, 12], dtype='int'),
                              'c': pd.Series([12, 15, 18, 111, 122, 125, 112], dtype='int'),
                              'd': pd.Series([212, 215, 218, 211, 222, 225, 312], dtype='int'),
                              'e': pd.Series([1121, 12151, 12181, 12111, 12221, 12251, 13121],
                                             dtype='int')})
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf_float)
    indexing_type = IndexingType.LINEAR
    drop_index = True

    print("Before Indexing")
    print(cn_tb)

    cn_tb.set_index('a', indexing_type, drop_index)

    pdf_float = pdf_float.set_index('a')

    print("After Indexing")
    assert cn_tb.column_names == ['b', 'c', 'd', 'e']

    assert cn_tb.get_index().get_type() == IndexingType.LINEAR

    loc_cn_1 = cn_tb.loc["7":"20", 'c':'e']
    loc_pd_1 = pdf_float.loc["7":"20", 'c':'e']

    assert loc_pd_1.values.tolist() == loc_cn_1.to_pandas().values.tolist()
    assert loc_cn_1.get_index().get_index_array() == pa.array(loc_pd_1.index)
    # assert loc_cn_1.get_arrow_index().get_index_array() == pa.array(loc_pd_1.index)

    loc_cn_2 = cn_tb.loc["7":"20", 'd':]
    loc_pd_2 = pdf_float.loc["7":"20", 'd':]

    assert loc_pd_2.values.tolist() == loc_cn_2.to_pandas().values.tolist()
    assert loc_cn_2.get_index().get_index_array() == pa.array(loc_pd_2.index)
    # assert loc_cn_2.get_arrow_index().get_index_array() == pa.array(loc_pd_2.index)

    loc_cn_3 = cn_tb.loc["7":, 'd':]
    loc_pd_3 = pdf_float.loc["7":, 'd':]

    assert loc_pd_3.values.tolist() == loc_cn_3.to_pandas().values.tolist()
    assert loc_cn_3.get_index().get_index_array() == pa.array(loc_pd_3.index)
    # assert loc_cn_3.get_arrow_index().get_index_array() == pa.array(loc_pd_3.index)

    loc_cn_4 = cn_tb.loc[:"7", 'd':]
    loc_pd_4 = pdf_float.loc[:"7", 'd':]

    assert loc_pd_4.values.tolist() == loc_cn_4.to_pandas().values.tolist()
    assert loc_cn_4.get_index().get_index_array() == pa.array(loc_pd_4.index)
    # assert loc_cn_4.get_arrow_index().get_index_array() == pa.array(loc_pd_4.index)

    loc_cn_5 = cn_tb.loc[:, 'd':]
    loc_pd_5 = pdf_float.loc[:, 'd':]

    assert loc_pd_5.values.tolist() == loc_cn_5.to_pandas().values.tolist()
    assert loc_cn_5.get_index().get_index_array() == pa.array(loc_pd_5.index)
    # assert loc_cn_5.get_arrow_index().get_index_array() == pa.array(loc_pd_5.index)

    loc_cn_6 = cn_tb.loc[["7", "20"], 'd':]
    loc_pd_6 = pdf_float.loc[["7", "20"], 'd':]

    assert loc_pd_6.values.tolist() == loc_cn_6.to_pandas().values.tolist()
    assert loc_cn_6.get_index().get_index_array() == pa.array(loc_pd_6.index)
Example #19
def test_table_is_in_dev():
    from typing import List
    from pyarrow.compute import and_
    from pyarrow import compute as a_compute
    col_validity = [False, True]
    # the comparison data must be broadcast so that its length matches the
    # number of rows in the table
    cols = 2
    rows = 4
    col_names_ = ['col-1', 'col-2']
    comp_col_names_ = ['col-11', 'col-2']
    row_indices_ = ['1', '2', '3', '4']
    row_indices_cmp_ = ['1', '21', '3', '41']

    data = [[2, 4, 3, 1], [0, 2, 1, 2]]
    cmp_data = [[12, 4, 13, 1], [10, 2, 12, 3]]
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    tb = cn.Table.from_list(ctx, col_names_, data)
    tb.set_index(row_indices_)
    tb_cmp = cn.Table.from_list(ctx, comp_col_names_, cmp_data)
    tb_cmp.set_index(row_indices_cmp_)

    def compare_array_like_values(l_org_ar, l_cmp_ar, skip_null=True):
        return a_compute.is_in(l_org_ar,
                               value_set=l_cmp_ar,
                               skip_nulls=skip_null)

    def broadcast(ar, broadcast_coefficient=1):
        bcast_ar = []
        for elem in ar:
            bcast_elems = []
            for i in range(broadcast_coefficient):
                bcast_elems.append(elem.as_py())
            bcast_ar.append(pa.array(bcast_elems))
        return bcast_ar

    def compare_two_arrays(l_ar, r_ar):
        return a_compute.and_(l_ar, r_ar)

    def compare_row_and_column(row, columns):
        comp_res = []
        for column in columns:
            print(type(column), type(row))
            comp_res.append(compare_two_arrays(l_ar=row, r_ar=column))
        return comp_res

    def populate_column_with_single_value(value, row_count):
        column_values = []
        for i in range(row_count):
            column_values.append(value)
        return column_values

    def tb_compare_values(tb, tb_cmp, skip_null=True):

        col_names = tb.column_names
        comp_col_names = tb_cmp.column_names

        row_indices = tb.index.index_values
        row_indices_cmp = tb_cmp.index.index_values

        col_comp_res = compare_array_like_values(
            l_org_ar=pa.array(col_names), l_cmp_ar=pa.array(comp_col_names))
        row_comp_res = compare_array_like_values(
            l_org_ar=pa.array(row_indices), l_cmp_ar=pa.array(row_indices_cmp))
        bcast_col_comp_res = broadcast(ar=col_comp_res,
                                       broadcast_coefficient=rows)
        row_col_comp = compare_row_and_column(row=row_comp_res,
                                              columns=bcast_col_comp_res)

        tb_ar = tb.to_arrow().combine_chunks()
        tb_cmp_ar = tb_cmp.to_arrow().combine_chunks()

        col_data_map = {}
        for col_name, validity, row_col_validity in zip(
                col_names, col_comp_res, row_col_comp):
            if validity.as_py():
                chunk_ar_org = tb_ar.column(col_name)
                chunk_ar_cmp = tb_cmp_ar.column(col_name)
                data_cmp_res = a_compute.is_in(chunk_ar_org,
                                               value_set=chunk_ar_cmp,
                                               skip_nulls=skip_null)
                print(data_cmp_res, row_col_validity)
                col_data_map[col_name] = compare_two_arrays(
                    data_cmp_res, row_col_validity)
            else:
                col_data_map[col_name] = pa.array(
                    populate_column_with_single_value(False, tb.row_count))

        is_in_values = list(col_data_map.values())
        return cn.Table.from_list(tb.context, col_names, is_in_values)

    new_tb = tb_compare_values(tb, tb_cmp)
    print(new_tb)
Example #20
def test_distributed_ra():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    rank = ctx.get_rank()
    size = ctx.get_world_size()

    assert size == 4

    table1_path = f'/tmp/user_usage_tm_{rank + 1}.csv'
    table2_path = f'/tmp/user_usage_tm_{rank + 1}.csv'

    assert os.path.exists(table1_path)
    assert os.path.exists(table2_path)

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb1: Table = read_csv(ctx, table1_path, csv_read_options)

    tb2: Table = read_csv(ctx, table2_path, csv_read_options)

    print("First Hello World From Rank {}, Size {}".format(
        ctx.get_rank(), ctx.get_world_size()))

    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type='inner',
                                      algorithm='hash',
                                      left_on=[0],
                                      right_on=[0])

    tb4: Table = tb1.distributed_union(tb2)

    tb5: Table = tb1.distributed_subtract(tb2)

    tb6: Table = tb1.distributed_intersect(tb2)

    ctx.barrier()

    join_row_count = tb3.row_count
    join_column_count = tb3.column_count

    subtract_row_count = tb5.row_count
    subtract_column_count = tb5.column_count

    union_row_count = tb4.row_count
    union_column_count = tb4.column_count

    intersect_row_count = tb6.row_count
    intersect_column_count = tb6.column_count

    if rank == 0:
        assert join_row_count == 1424 and join_column_count == 8
        assert subtract_row_count == 0 and subtract_column_count == 4
        assert union_row_count == 112 and union_column_count == 4
        assert intersect_row_count == 112 and intersect_column_count == 4

    if rank == 1:
        assert join_row_count == 1648 and join_column_count == 8
        assert subtract_row_count == 0 and subtract_column_count == 4
        assert union_row_count == 122 and union_column_count == 4
        assert intersect_row_count == 122 and intersect_column_count == 4

    if rank == 2:
        assert join_row_count == 2704 and join_column_count == 8
        assert subtract_row_count == 0 and subtract_column_count == 4
        assert union_row_count == 102 and union_column_count == 4
        assert intersect_row_count == 102 and intersect_column_count == 4

    if rank == 3:
        assert join_row_count == 1552 and join_column_count == 8
        assert subtract_row_count == 0 and subtract_column_count == 4
        assert union_row_count == 144 and union_column_count == 4
        assert intersect_row_count == 144 and intersect_column_count == 4
Example #21
def test_concat_op():
    from pycylon.net import MPIConfig
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)
    columns = ['c1', 'c2', 'c3']
    dataset_1 = [[1, 2, 3, 4, 5], [20, 30, 40, 50, 51], [33, 43, 53, 63, 73]]
    dataset_2 = [[1, 20, 3, 4, 50], [20, 30, 40, 50, 51], [33, 43, 53, 63, 73]]
    dataset_3 = [[1, 20, 3, 40, 50, 60], [21, 31, 41, 51, 50, 70],
                 [32, 42, 52, 62, 72, 82]]

    tb1 = Table.from_list(ctx, columns, dataset_1)
    tb1 = tb1.add_prefix('d1_')

    tb2 = Table.from_list(ctx, columns, dataset_2)
    tb2 = tb2.add_prefix('d2_')

    tb3 = Table.from_list(ctx, columns, dataset_3)
    tb3 = tb3.add_prefix('d3_')

    tb4 = Table.from_list(ctx, columns, dataset_3)
    tb4 = tb4.add_prefix('d1_')

    pdf1 = tb1.to_pandas()
    pdf2 = tb2.to_pandas()
    pdf3 = tb3.to_pandas()
    pdf4 = tb4.to_pandas()

    print(tb1)
    print("-" * 80)
    print(tb2)

    tb1.set_index(tb1.column_names[0], drop=True)
    tb2.set_index(tb2.column_names[0], drop=True)
    tb3.set_index(tb3.column_names[0], drop=True)
    tb4.set_index(tb4.column_names[0], drop=True)

    print("*" * 80)
    print("Indexed table")
    print(tb1)
    print("*" * 80)

    pdf1.set_index(pdf1.columns[0], drop=True, inplace=True)
    pdf2.set_index(pdf2.columns[0], drop=True, inplace=True)
    pdf3.set_index(pdf3.columns[0], drop=True, inplace=True)
    pdf4.set_index(pdf4.columns[0], drop=True, inplace=True)

    print("=" * 80)
    print("axis=1")
    print("=" * 80)
    res_pdf_1 = pd.concat([pdf1, pdf2], join='inner', axis=1)
    print(res_pdf_1)
    print("-" * 80)
    tables = [tb1, tb2]
    tb1_index_values = tb1.index.index_values
    tb2_index_values = tb2.index.index_values
    res_tb_1 = Table.concat(tables, join='inner', axis=1)
    print(res_tb_1)
    print("-" * 80)
    res_pdf_2 = pd.concat([pdf1, pdf2], join='inner', axis=1)
    print(res_pdf_2)
    assert res_pdf_2.values.tolist() == res_tb_1.to_pandas().values.tolist()
    assert res_tb_1.index.index_values == res_pdf_2.index.values.tolist()
    print("-" * 80)
    print(tb1.to_arrow())
    print(tb2.to_arrow())
    print(tb1.index.index_values, tb1_index_values)
    print(tb2.index.index_values, tb2_index_values)
    assert sorted(tb1.index.index_values) == sorted(tb1_index_values)
    assert sorted(tb2.index.index_values) == sorted(tb2_index_values)
    print("=" * 80)
    print("axis=0")
    print("=" * 80)
    res_pdf_3 = pd.concat([pdf1, pdf4], join='inner', axis=0)
    print(tb1.column_names, tb4.column_names)
    res_tb_2 = Table.concat([tb1, tb4], join='inner', axis=0)
    print(res_tb_2)
    print(res_tb_2.index.index_values)
    print(res_pdf_3)
    print(res_pdf_3.index.values.tolist())
    assert res_pdf_3.values.tolist() == res_tb_2.to_pandas().values.tolist()
    assert res_tb_2.index.index_values == res_pdf_3.index.values.tolist()
Example #22
def test_concat_table():
    """
        For Cylon concat operation:

        We can check for indexing column if default the index array contains [0,num_records-1)
        If indexed, the indexed column will be compared.

        We can use existing join ops.

        Algorithm
        =========

        axis=1 (regular join op considering a column)
        ----------------------------------------------

        1. If indexed or not, do a reset_index op (which will add the new column as 'index' in both
        tables)
        2. Do the regular join by considering the 'index' column
        3. Set the index by 'index' in the resultant table

        axis=0 (stacking tables or similar to merge function)
        -----------------------------------------------------
        assert: column count must match
        the two tables are stacked upon each other in order
        The index is created by concatenating two indices
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    columns = ['c1', 'c2', 'c3']
    dataset_1 = [[1, 2, 3, 4, 5], [20, 30, 40, 50, 51], [33, 43, 53, 63, 73]]
    dataset_2 = [[1, 20, 3, 4, 50], [20, 30, 40, 50, 51], [33, 43, 53, 63, 73]]
    dataset_3 = [[1, 20, 3, 40, 50, 60], [21, 31, 41, 51, 50, 70],
                 [32, 42, 52, 62, 72, 82]]

    tb1 = Table.from_list(ctx, columns, dataset_1)
    tb1 = tb1.add_prefix('d1_')

    tb2 = Table.from_list(ctx, columns, dataset_2)
    tb2 = tb2.add_prefix('d2_')

    tb3 = Table.from_list(ctx, columns, dataset_3)
    tb3 = tb3.add_prefix('d3_')

    tb4 = Table.from_list(ctx, columns, dataset_3)
    tb4 = tb4.add_prefix('d1_')

    pdf1 = tb1.to_pandas()
    pdf2 = tb2.to_pandas()
    pdf3 = tb3.to_pandas()
    pdf4 = tb4.to_pandas()

    print(tb1)
    print("-" * 80)
    print(tb2)

    tb1.set_index(tb1.column_names[0], drop=True)
    tb2.set_index(tb2.column_names[0], drop=True)
    tb3.set_index(tb3.column_names[0], drop=True)

    print("*" * 80)
    print("Indexed table")
    print(tb1)
    print("*" * 80)
    print("Reset_Index table")
    tb1.reset_index()
    print(tb1)
    print("*" * 80)

    pdf1.set_index(pdf1.columns[0], drop=True, inplace=True)
    pdf2.set_index(pdf2.columns[0], drop=True, inplace=True)
    pdf3.set_index(pdf3.columns[0], drop=True, inplace=True)

    print("=" * 80)
    print("axis=1")
    print("=" * 80)
    res_pdf_1 = pd.concat([pdf1, pdf2], join='inner', axis=1)
    print(res_pdf_1)
    print("-" * 80)
    res_pdf_2 = pd.concat([pdf1, pdf3], join='inner', axis=1)
    print(res_pdf_2)
    print("-" * 80)

    print("=" * 80)
    print("axis=0")
    print("=" * 80)
    res_pdf_1 = pd.concat([pdf1, pdf2], join='inner', axis=0)
    print(res_pdf_1)
    print("-" * 80)
    res_pdf_2 = pd.concat([pdf1, pdf3], join='inner', axis=0)
    print(res_pdf_2)
    print("-" * 80)
    res_pdf_3 = pd.concat([pdf1, pdf4], join='inner', axis=0)
    print(res_pdf_3)
    print("-" * 80)
    print("Multi Table Concat 1")
    res_pdf_4 = pd.concat([pdf1, pdf2, pdf3], join='inner', axis=1)
    print(res_pdf_4)
    print("Multi Table Concat 2")
    res_pdf_5 = pd.concat([pdf2, pdf3, pdf1], join='inner', axis=1)
    print(res_pdf_5)
Example #23
def test_math_i_ops_for_scalar():
    """
    TODO: Enhance Test case and functionality
        Check the following case : https://github.com/cylondata/cylon/issues/229
    >>> from operator import __iadd__
    >>> assert __iadd__(cylon_table, value) == (cylon_table += value)
    >>> Failure ...
    """
    npr = np.array([[20, 2, 3, 4, 5], [10, -20, -30, -40, -50],
                    [12.2, 13.2, 16.4, 12.2, 10.8]])
    pdf = pd.DataFrame(npr)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf)

    cn_tb_1 = cn_tb
    pdf_1 = pdf
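    # Note: cn_tb_1/pdf_1 are aliases of cn_tb/pdf (no copy is made), so the
    # in-place ops below mutate the originals as well.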
    # test column addition

    cn_tb_1['0'] += 2
    pdf_1[0] += 2

    assert pdf_1.values.tolist() == cn_tb_1.to_pandas().values.tolist()

    cn_tb_1['0'] -= 2
    pdf_1[0] -= 2

    assert pdf_1.values.tolist() == cn_tb_1.to_pandas().values.tolist()

    cn_tb_1['0'] *= 2
    pdf_1[0] *= 2

    assert pdf_1.values.tolist() == cn_tb_1.to_pandas().values.tolist()

    cn_tb_1['0'] /= 2
    pdf_1[0] /= 2

    assert pdf_1.values.tolist() == cn_tb_1.to_pandas().values.tolist()

    # test whole-table in-place arithmetic
    cn_tb_2 = cn_tb
    pdf_2 = pdf

    cn_tb_2 += 2
    pdf += 2

    assert pdf_2.values.tolist() == cn_tb_2.to_pandas().values.tolist()

    cn_tb_2 -= 2
    pdf -= 2

    assert pdf_2.values.tolist() == cn_tb_2.to_pandas().values.tolist()

    cn_tb_2 *= 2
    pdf *= 2

    assert pdf_2.values.tolist() == cn_tb_2.to_pandas().values.tolist()

    cn_tb_2 /= 2
    pdf /= 2

    assert pdf_2.values.tolist() == cn_tb_2.to_pandas().values.tolist()
Example #24

import numpy as np
import pyarrow as pa
import pandas as pd
import pycylon as cn
from pycylon import CylonContext

ctx: CylonContext = CylonContext(config=None, distributed=False)

columns = 2

data1 = np.array([0, 1, 2, 3, 4, 5], dtype=np.int32)
data2 = np.array([10, 11, 12, 13, 14, 15], dtype=np.float32)

nd_array_list = [data1, data2]

ar_array: pa.array = pa.array(nd_array_list)

ar_table: pa.Table = pa.Table.from_arrays(nd_array_list, names=['x0', 'x1'])

print(ar_table)

ar1 = pa.array([1, 2, 3, 4])
Example #25
def test_aggregate():
    ctx: CylonContext = CylonContext(config=None, distributed=False)

    columns = 2

    data1 = np.array([0, 1, 2, 3, 4, 5], dtype=np.int32)
    data2 = np.array([10, 11, 12, 13, 14, 15], dtype=np.float32)

    nd_array_list = [data1, data2]

    ar_array: pa.array = pa.array(nd_array_list)

    ar_table: pa.Table = pa.Table.from_arrays(nd_array_list,
                                              names=['x0', 'x1'])

    ar1 = pa.array([1, 2, 3, 4])
    ar2 = pa.array(['a', 'b', 'c', 'd'])

    ar_tb2: pa.Table = pa.Table.from_arrays([ar1, ar2], names=['col1', 'col2'])

    assert isinstance(ar_tb2, pa.Table)

    col_names = ['col1', 'col2']

    cn_tb1 = cn.Table.from_numpy(ctx, col_names, nd_array_list)

    assert cn_tb1.row_count == data1.shape[0] and cn_tb1.column_count == len(
        nd_array_list)

    data_list = [[1, 2, 3, 4], ['p', 'q', 'r', 's']]

    cn_tb2 = cn.Table.from_list(ctx, col_names, data_list)

    assert cn_tb2.row_count == len(
        data_list[0]) and cn_tb2.column_count == len(data_list)

    dict1 = {'col1': [1, 2], 'col2': ['a', 'b']}

    ar_tb3: pa.Table = pa.Table.from_pydict(dict1)

    cn_tb3: cn.Table = cn.Table.from_pydict(ctx, dict1)

    assert cn_tb3.row_count == len(
        dict1['col1']) and cn_tb3.column_count == len(dict1)

    pdf = pd.DataFrame(dict1)

    # Table.from_pandas signature: (df, schema=None, preserve_index=None,
    # nthreads=None, columns=None, safe=True)

    cn_tb4: cn.Table = cn.Table.from_pandas(ctx, pdf)

    assert cn_tb4.row_count == len(
        dict1['col1']) and cn_tb4.column_count == len(dict1)

    dict2 = {'col1': [1, 2, 3], 'col2': [2, 4, 3]}

    cn_tb5: cn.Table = cn.Table.from_pydict(ctx, dict2)

    assert cn_tb5.row_count == len(
        dict2['col1']) and cn_tb5.column_count == len(dict2)

    npy = cn_tb5.to_numpy()

    assert npy.shape == (len(dict2['col1']), len(dict2))

    dict3 = cn_tb5.to_pydict()

    assert dict3 == dict2

    for key1, key2 in zip(dict3.keys(), cn_tb5.column_names):
        assert key1 == key2

    assert cn_tb5.to_numpy().shape == (len(dict2['col1']), len(dict2))

    ## Aggregate Sum

    cn_tb6 = cn_tb5.sum('col1')

    assert cn_tb6.to_numpy()[0][0] == sum(dict2['col1'])

    cn_tb7 = cn_tb5.sum(0)

    assert cn_tb7.to_numpy()[0][0] == sum(dict2['col1'])

    ## Aggregate Count

    cn_tb8 = cn_tb5.count('col1')

    assert cn_tb8.to_numpy()[0][0] == len(dict2['col1'])

    cn_tb9 = cn_tb5.count(0)

    assert cn_tb9.to_numpy()[0][0] == len(dict2['col1'])

    ## Aggregate Min

    cn_tb10 = cn_tb5.min('col1')

    assert cn_tb10.to_numpy()[0][0] == min(dict2['col1'])

    cn_tb11 = cn_tb5.min(0)

    assert cn_tb11.to_numpy()[0][0] == min(dict2['col1'])

    ## Aggregate Max

    cn_tb12 = cn_tb5.max('col1')

    assert cn_tb12.to_numpy()[0][0] == max(dict2['col1'])

    cn_tb13 = cn_tb5.max(0)

    assert cn_tb13.to_numpy()[0][0] == max(dict2['col1'])

    from pycylon.data.aggregates import AggregationOp

    op1 = AggregationOp.SUM

    assert (op1 == AggregationOp.SUM)

    df = pd.DataFrame({
        'AnimalId': [1, 1, 2, 2, 3, 4, 4, 3],
        'Max Speed': [380., 370., 24., 26., 23.1, 300.1, 310.2, 25.2]
    })

    ar_tb_gb = pa.Table.from_pandas(df)

    assert isinstance(ar_tb_gb, pa.Table)

    cn_tb_gb = cn.Table.from_arrow(ctx, ar_tb_gb)

    assert isinstance(cn_tb_gb, cn.Table)

    pdf1 = df.groupby(['AnimalId']).sum()

    cn_tb_gb_res = cn_tb_gb.groupby(0, [1], [AggregationOp.SUM]).sort(0)

    for val1, val2 in zip(cn_tb_gb_res.to_pydict()['Max Speed'],
                          pdf1.to_dict()['Max Speed'].values()):
        assert val1 == val2

    cn_tb_gb_res1 = cn_tb_gb.groupby(0, ['Max Speed'],
                                     [AggregationOp.SUM]).sort(0)

    for val1, val2 in zip(cn_tb_gb_res1.to_pydict()['Max Speed'],
                          pdf1.to_dict()['Max Speed'].values()):
        assert val1 == val2

    pdf2 = df.groupby(['AnimalId']).min()

    cn_tb_gb_res2 = cn_tb_gb.groupby(0, ['Max Speed'],
                                     [AggregationOp.MIN]).sort(0)

    for val1, val2 in zip(cn_tb_gb_res2.to_pydict()['Max Speed'],
                          pdf2.to_dict()['Max Speed'].values()):
        assert val1 == val2

    pdf3 = df.groupby(['AnimalId']).max()

    cn_tb_gb_res3 = cn_tb_gb.groupby(0, ['Max Speed'],
                                     [AggregationOp.MAX]).sort(0)

    for val1, val2 in zip(cn_tb_gb_res3.to_pydict()['Max Speed'],
                          pdf3.to_dict()['Max Speed'].values()):
        assert val1 == val2