Beispiel #1
0
def load_data_to_cn_tables():
    """Read the train and test CSV files into Cylon tables.

    Relies on module-level ``train_file_path``, ``test_file_path`` and
    ``delimiter`` being defined elsewhere in the file.

    Returns:
        tuple: ``(train_table, test_table)`` as :class:`Table` objects.
    """
    # TODO : add an API endpoint
    # Table.from_csv('file.csv')
    train_table: Table = csv_reader.read(train_file_path, delimiter)
    test_table: Table = csv_reader.read(test_file_path, delimiter)
    return train_table, test_table
def multi_process(args):
    """Demonstrate distributed joins on two CSV-backed Cylon tables.

    Reads the two tables named in ``args`` under an MPI context and runs
    the same inner sort join four ways: by column index via
    ``left_on``/``right_on``, by column name, and via the shared ``on``
    shorthand (by name, then by index), showing each result.

    Args:
        args: Namespace with ``table1_path`` and ``table2_path``
            attributes (paths to the two CSV inputs).
    """
    ctx: CylonContext = CylonContext(config='mpi')

    tb1: Table = csv_reader.read(ctx, args.table1_path, ',')
    tb2: Table = csv_reader.read(ctx, args.table2_path, ',')

    print(tb1.column_names)
    print(tb2.column_names)

    configs = {
        'join_type': 'inner',
        'algorithm': 'sort',
        'left_col': 0,
        'right_col': 0
    }

    # Join keyed by positional column index on each side.
    tb3: Table = tb1.distributed_join(ctx,
                                      table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=[0],
                                      right_on=[0])
    tb3.show()

    # Join keyed by column name on each side.
    tb4: Table = tb1.distributed_join(ctx,
                                      table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=['A'],
                                      right_on=['A'])
    tb4.show()

    # BUG FIX: this result previously rebound tb4, silently discarding
    # the name-based join above; it now gets its own variable.
    tb5: Table = tb1.distributed_join(ctx,
                                      table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      on=['A'])
    tb5.show()

    tb6: Table = tb1.distributed_join(ctx,
                                      table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      on=[0])
    tb6.show()

    ctx.finalize()
def demo_basic(rank, world_size):
    """Join two per-rank CSV tables with Cylon, then train a dummy model
    with PyTorch DistributedDataParallel on the joined data.

    Args:
        rank: This process's rank; also used below as the torch device id.
        world_size: Total number of participating processes.

    NOTE(review): this function references names not defined in its body
    -- ``hostname``, ``ctx``, ``setup``, ``cleanup``, ``Network`` and the
    torch/DDP imports; confirm they exist at module scope.
    """
    print(f"Simple Batch Train => [{hostname}]Demo DDP Rank {rank}")
    setup(rank=rank, world_size=world_size)

    base_path = "/tmp"

    # Input files are partitioned per process; names carry a 1-based rank.
    user_devices_file = os.path.join(base_path,
                                     f'user_device_tm_{rank + 1}.csv')
    user_usage_file = os.path.join(base_path, f'user_usage_tm_{rank + 1}.csv')

    user_devices_data: Table = csv_reader.read(ctx, user_devices_file, ',')
    user_usage_data: Table = csv_reader.read(ctx, user_usage_file, ',')

    print(
        f"User Devices Data Rows:{user_devices_data.rows}, Columns: {user_devices_data.columns}"
    )
    print(
        f"User Usage Data Rows:{user_usage_data.rows}, Columns: {user_usage_data.columns}"
    )

    print("--------------------------------")
    print("Before Join")
    print("--------------------------------")
    user_devices_data.show_by_range(1, 5, 0, 4)
    print("-------------------------------------")
    user_usage_data.show_by_range(1, 5, 0, 4)

    # Inner sort join: column 0 of the devices table against column 3 of
    # the usage table.
    new_tb: Table = user_devices_data.join(ctx, user_usage_data, 'inner',
                                           'sort', 0, 3)
    print("----------------------")
    print("New Table After Join (5 Records)")
    new_tb.show_by_range(0, 5, 0, 8)
    print("----------------------")

    data_ar: np.ndarray = new_tb.to_numpy()

    # Columns 2..5 as features, column 6 as the learning target --
    # assumes the joined table's schema; TODO confirm against the CSVs.
    data_features: np.ndarray = data_ar[:, 2:6]
    data_learner: np.ndarray = data_ar[:, 6:7]

    # Fixed train/test split at row 100.
    x_train, y_train = data_features[0:100], data_learner[0:100]
    x_test, y_test = data_features[100:], data_learner[100:]

    x_train = np.asarray(x_train, dtype=np.float32)
    y_train = np.asarray(y_train, dtype=np.float32)
    x_test = np.asarray(x_test, dtype=np.float32)
    y_test = np.asarray(y_test, dtype=np.float32)

    # Move tensors to the device indexed by this rank (rank doubles as
    # the CUDA device id -- presumes one GPU per process; verify).
    x_train = torch.from_numpy(x_train).to(rank)
    y_train = torch.from_numpy(y_train).to(rank)
    x_test = torch.from_numpy(x_test).to(rank)
    y_test = torch.from_numpy(y_test).to(rank)

    # create model and move it to GPU with id rank

    model = Network().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    if rank == 0:
        print("Training A Dummy Model")
    # 20 epochs over per-sample batches; gradients are zeroed before each
    # backward pass so steps do not accumulate.
    for t in range(20):
        for x_batch, y_batch in zip(x_train, y_train):
            print(f"Epoch {t}", end='\r')
            prediction = ddp_model(x_batch)
            loss = loss_fn(prediction, y_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    cleanup()
Beispiel #4
0
from pycylon import Table
from pycylon.csv import csv_reader
from pycylon import CylonContext

# Sequential (non-distributed) context: config=None means no communicator.
ctx: CylonContext = CylonContext(config=None)

table: Table = csv_reader.read(ctx, '/tmp/user_usage_tm_1.csv', ',')

print("Table Column Names")
print(table.column_names)

print("Table Schema")
print(table.schema)

# Exercise single-row and slice indexing; each selection is rendered
# through pandas, in the same order as the original demo.
for selection in (table[0], table[0:5], table[2:5], table[5], table[7]):
    print(selection.to_pandas())

table.show_by_range(0, 4, 0, 4)

print(table[0:5].to_pandas())

ctx.finalize()

import pyarrow as pa
Beispiel #5
0
from pycylon.csv import csv_reader
from pycylon import Table
from pycylon import CylonContext
import argparse

# Distributed context over MPI; run one process per table partition.
ctx: CylonContext = CylonContext("mpi")

parser = argparse.ArgumentParser(description='PyCylon Table Conversion')
parser.add_argument('--table1_path', type=str, help='Path to table 1 csv')
parser.add_argument('--table2_path', type=str, help='Path to table 2 csv')
args = parser.parse_args()

left_table: Table = csv_reader.read(ctx, args.table1_path, ',')
right_table: Table = csv_reader.read(ctx, args.table2_path, ',')

# Left hash join keyed on the first column of each table.
join_params = {
    'join_type': 'left',
    'algorithm': 'hash',
    'left_col': 0,
    'right_col': 0,
}

joined: Table = left_table.distributed_join(ctx, table=right_table,
                                            **join_params)
Beispiel #6
0
import os

import numpy as np
import pandas as pd

from pycylon import CylonContext
from pycylon import Table
from pycylon.csv import csv_reader

# Each MPI rank reads its own partition; file names carry a 1-based rank.
ctx: CylonContext = CylonContext(config='mpi')

base_path = "/tmp"

rank = ctx.get_rank()

# BUG FIX: os.path.join was used here but ``os`` was never imported.
user_devices_file = os.path.join(base_path, f'user_device_tm_{rank+1}.csv')
user_usage_file = os.path.join(base_path, f'user_usage_tm_{rank+1}.csv')

user_devices_data: Table = csv_reader.read(ctx, user_devices_file, ',')
user_usage_data: Table = csv_reader.read(ctx, user_usage_file, ',')

# Round-trip to pandas for interop with the wider PyData ecosystem.
user_devices_df: pd.DataFrame = user_devices_data.to_pandas()
user_usage_df: pd.DataFrame = user_usage_data.to_pandas()

print(
    f"User Devices Data Rows:{user_devices_data.rows}, Columns: {user_devices_data.columns}"
)
print(
    f"User Usage Data Rows:{user_usage_data.rows}, Columns: {user_usage_data.columns}"
)

print("--------------------------------")
print("Before Join")
print("--------------------------------")
Beispiel #7
0
# See the License for the specific language governing permissions and
# limitations under the License.
##

from pycylon import CylonContext
from pycylon import Table
from pycylon.csv import csv_reader

if __name__ == "__main__":
    # Distributed run over MPI; data files are named with a 1-based rank.
    ctx: CylonContext = CylonContext("mpi")
    rank = ctx.get_rank() + 1

    csv1 = f"/tmp/user_device_tm_{rank}.csv"
    csv2 = f"/tmp/user_usage_tm_{rank}.csv"

    first_table: Table = csv_reader.read(ctx, csv1, ',')
    second_table: Table = csv_reader.read(ctx, csv2, ',')

    # BUG FIX: the second Columns slot previously printed
    # first_table.columns twice instead of second_table.columns.
    print(f"Table 1 & 2 Rows [{first_table.rows},{second_table.rows}], "
          f"Columns [{first_table.columns},{second_table.columns}]")

    # Inner sort join: column 0 of the first table against column 3 of
    # the second.
    joined_table: Table = first_table.distributed_join(ctx,
                                                       table=second_table,
                                                       join_type="inner",
                                                       algorithm="sort",
                                                       left_col=0,
                                                       right_col=3)

    print(
        f"First table had : {first_table.rows} and Second table had : {second_table.rows}, "
        f"Joined has : {joined_table.rows}")
    # NOTE(review): ctx.finalize() is never called here, unlike the other
    # examples in this file -- confirm whether shutdown is handled elsewhere.
Beispiel #8
0
# NOTE(review): fragment -- ``base_dir``, ``src_dir``, ``srank``, ``rank``,
# ``ctx`` and ``RunJoin`` are defined outside this excerpt, and the final
# RunJoin(...) call is cut off at the end; do not run as-is.
# SECURITY NOTE: os.system interpolates paths into a shell command; only
# safe if base_dir/src_dir are trusted, hard-coded values.
os.system(f"mkdir -p {base_dir}; rm -f {base_dir}/*.csv")

# Per-rank working copies of the two input CSVs.
csv1: str = os.path.join(base_dir, f"csv1_{srank}.csv")
csv2: str = os.path.join(base_dir, f"csv2_{srank}.csv")

src1 = f"{src_dir}/csv1_{srank}.csv"
src2 = f"{src_dir}/csv2_{srank}.csv"

print("src files ", src1, src2, flush=True)

os.system(f"cp {src1} {csv1}")
os.system(f"cp {src2} {csv2}")

logging.info(f"{srank} Reading tables")

table1: Table = csv_reader.read(ctx, csv1, ',')
table2: Table = csv_reader.read(ctx, csv2, ',')

# Touch the row/column properties to confirm both tables actually loaded.
try:
    logging.info(
        f"Table 1 & 2 Rows [{table1.rows},{table2.rows}], Columns [{table1.columns},{table1.columns}]"
    )
except Exception:
    raise Exception("Something went wrong in loading tables from disk")

logging.info("Inner Join Start")

# Call truncated at the excerpt boundary -- remaining arguments not visible.
RunJoin(rank=rank,
        ctx=ctx,
        table1=table1,
        table2=table2,
Beispiel #9
0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##
"""
Run test:

>>> python python/test/test_table.py --table_path /tmp/csv.csv
"""

from pycylon.csv import csv_reader
from pycylon import Table
from pycylon import CylonContext
import argparse

# Sequential run: no distributed communicator needed for this test.
ctx: CylonContext = CylonContext(config=None)

arg_parser = argparse.ArgumentParser(description='PyCylon Table')
arg_parser.add_argument('--table_path', type=str, help='Path to table csv')
cli_args = arg_parser.parse_args()

loaded: Table = csv_reader.read(ctx, cli_args.table_path, ',')

print(f"Cylon Table Rows {loaded.rows}, Columns {loaded.columns}")

ctx.finalize()
def load_data_to_cn_tables():
    """Load the train and test CSVs into Cylon tables and return both.

    Uses the module-level ``train_file_path``, ``test_file_path`` and
    ``delimiter`` values.
    """
    training: Table = csv_reader.read(train_file_path, delimiter)
    testing: Table = csv_reader.read(test_file_path, delimiter)
    return training, testing