def load_data_to_cn_tables():
    # TODO: add an API endpoint
    # Table.from_csv('file.csv')
    tb_train: Table = csv_reader.read(train_file_path, delimiter)
    tb_test: Table = csv_reader.read(test_file_path, delimiter)
    return tb_train, tb_test
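
# A minimal usage sketch for load_data_to_cn_tables(). The module-level
# train_file_path, test_file_path, and delimiter it reads are defined elsewhere
# in the original script; the values below are placeholders, not the originals.
train_file_path = "/tmp/train.csv"
test_file_path = "/tmp/test.csv"
delimiter = ","

tb_train, tb_test = load_data_to_cn_tables()
print(f"Train rows: {tb_train.rows}, Test rows: {tb_test.rows}")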
def multi_process(args):
    ctx: CylonContext = CylonContext(config='mpi')

    tb1: Table = csv_reader.read(ctx, args.table1_path, ',')
    tb2: Table = csv_reader.read(ctx, args.table2_path, ',')

    print(tb1.column_names)
    print(tb2.column_names)

    configs = {'join_type': 'inner', 'algorithm': 'sort', 'left_col': 0,
               'right_col': 0}

    # Join on column indices
    tb3: Table = tb1.distributed_join(ctx, table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=[0], right_on=[0])
    tb3.show()

    # Join on column names
    tb4: Table = tb1.distributed_join(ctx, table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=['A'], right_on=['A'])
    tb4.show()

    # Shared-key shorthand with a column name
    tb5: Table = tb1.distributed_join(ctx, table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      on=['A'])
    tb5.show()

    # Shared-key shorthand with a column index
    tb6: Table = tb1.distributed_join(ctx, table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      on=[0])
    tb6.show()

    ctx.finalize()
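
# multi_process() expects parsed CLI arguments. A minimal driver sketch that
# mirrors the argparse setup used by the other examples in this collection;
# the description string is illustrative, not taken from the original.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='PyCylon distributed join')
    parser.add_argument('--table1_path', type=str, help='Path to table 1 csv')
    parser.add_argument('--table2_path', type=str, help='Path to table 2 csv')
    multi_process(parser.parse_args())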
def demo_basic(rank, world_size):
    print(f"Simple Batch Train => [{hostname}] Demo DDP Rank {rank}")
    setup(rank=rank, world_size=world_size)

    # Assumed context creation: the original excerpt uses ctx without defining
    # it; a local sequential context matches the per-process reads done here.
    ctx: CylonContext = CylonContext(config=None)

    base_path = "/tmp"
    user_devices_file = os.path.join(base_path, f'user_device_tm_{rank + 1}.csv')
    user_usage_file = os.path.join(base_path, f'user_usage_tm_{rank + 1}.csv')

    user_devices_data: Table = csv_reader.read(ctx, user_devices_file, ',')
    user_usage_data: Table = csv_reader.read(ctx, user_usage_file, ',')

    print(f"User Devices Data Rows: {user_devices_data.rows}, "
          f"Columns: {user_devices_data.columns}")
    print(f"User Usage Data Rows: {user_usage_data.rows}, "
          f"Columns: {user_usage_data.columns}")

    print("--------------------------------")
    print("Before Join")
    print("--------------------------------")
    user_devices_data.show_by_range(1, 5, 0, 4)
    print("-------------------------------------")
    user_usage_data.show_by_range(1, 5, 0, 4)

    # Sequential join on this rank's partition: device table column 0 against
    # usage table column 3
    new_tb: Table = user_devices_data.join(ctx, user_usage_data, 'inner',
                                           'sort', 0, 3)
    print("----------------------")
    print("New Table After Join (5 Records)")
    new_tb.show_by_range(0, 5, 0, 8)
    print("----------------------")

    # Hand the joined table to PyTorch as NumPy arrays
    data_ar: np.ndarray = new_tb.to_numpy()
    data_features: np.ndarray = data_ar[:, 2:6]
    data_learner: np.ndarray = data_ar[:, 6:7]

    x_train, y_train = data_features[0:100], data_learner[0:100]
    x_test, y_test = data_features[100:], data_learner[100:]

    x_train = np.asarray(x_train, dtype=np.float32)
    y_train = np.asarray(y_train, dtype=np.float32)
    x_test = np.asarray(x_test, dtype=np.float32)
    y_test = np.asarray(y_test, dtype=np.float32)

    x_train = torch.from_numpy(x_train).to(rank)
    y_train = torch.from_numpy(y_train).to(rank)
    x_test = torch.from_numpy(x_test).to(rank)
    y_test = torch.from_numpy(y_test).to(rank)

    # create model and move it to GPU with id rank
    model = Network().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    optimizer.zero_grad()

    if rank == 0:
        print("Training A Dummy Model")
    for t in range(20):
        for x_batch, y_batch in zip(x_train, y_train):
            print(f"Epoch {t}", end='\r')
            prediction = ddp_model(x_batch)
            loss = loss_fn(prediction, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    cleanup()
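
# demo_basic() relies on setup(), cleanup(), and a Network module defined
# elsewhere in the original example. A sketch of typical DDP helpers, following
# the standard PyTorch distributed tutorial; the MASTER_ADDR/MASTER_PORT values
# are placeholders.
import os

import torch.distributed as dist

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # join the default process group so DDP can synchronize gradients
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()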
import pyarrow as pa

from pycylon import Table
from pycylon.csv import csv_reader
from pycylon import CylonContext

ctx: CylonContext = CylonContext(config=None)

tb: Table = csv_reader.read(ctx, '/tmp/user_usage_tm_1.csv', ',')

print("Table Column Names")
print(tb.column_names)
print("Table Schema")
print(tb.schema)

# Single-row and range indexing, materialized through pandas for display
print(tb[0].to_pandas())
print(tb[0:5].to_pandas())
print(tb[2:5].to_pandas())
print(tb[5].to_pandas())
print(tb[7].to_pandas())

tb.show_by_range(0, 4, 0, 4)
print(tb[0:5].to_pandas())

ctx.finalize()
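
# The pyarrow import above is unused in this excerpt, which suggests the
# original script also exercised Arrow interop. A hedged sketch of what that
# could look like, assuming a Table.to_arrow() conversion is available; it
# would need to run before ctx.finalize():
arrow_tb: pa.Table = tb.to_arrow()
print(arrow_tb.schema)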
from pycylon.csv import csv_reader
from pycylon import Table
from pycylon import CylonContext
import argparse

ctx: CylonContext = CylonContext("mpi")

parser = argparse.ArgumentParser(description='PyCylon Table Conversion')
parser.add_argument('--table1_path', type=str, help='Path to table 1 csv')
parser.add_argument('--table2_path', type=str, help='Path to table 2 csv')
args = parser.parse_args()

tb1: Table = csv_reader.read(ctx, args.table1_path, ',')
tb2: Table = csv_reader.read(ctx, args.table2_path, ',')

configs = {'join_type': 'left', 'algorithm': 'hash', 'left_col': 0,
           'right_col': 0}

tb3: Table = tb1.distributed_join(ctx, table=tb2,
                                  join_type=configs['join_type'],
                                  algorithm=configs['algorithm'],
                                  left_col=configs['left_col'],
                                  right_col=configs['right_col'])
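
# The script as given ends without inspecting tb3 or releasing the context.
# A hedged follow-up one might append; the attribute names match the other
# examples in this collection:
print(f"Rank {ctx.get_rank()} joined rows: {tb3.rows}")
ctx.finalize()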
import os

import numpy as np
import pandas as pd
from pycylon import CylonContext
from pycylon import Table
from pycylon.csv import csv_reader

ctx: CylonContext = CylonContext(config='mpi')

base_path = "/tmp"
rank = ctx.get_rank()
user_devices_file = os.path.join(base_path, f'user_device_tm_{rank + 1}.csv')
user_usage_file = os.path.join(base_path, f'user_usage_tm_{rank + 1}.csv')

user_devices_data: Table = csv_reader.read(ctx, user_devices_file, ',')
user_usage_data: Table = csv_reader.read(ctx, user_usage_file, ',')

user_devices_df: pd.DataFrame = user_devices_data.to_pandas()
user_usage_df: pd.DataFrame = user_usage_data.to_pandas()

print(f"User Devices Data Rows: {user_devices_data.rows}, "
      f"Columns: {user_devices_data.columns}")
print(f"User Usage Data Rows: {user_usage_data.rows}, "
      f"Columns: {user_usage_data.columns}")

print("--------------------------------")
print("Before Join")
print("--------------------------------")
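
# The excerpt stops before the join itself. A sketch of the pandas-side join
# that the "Before Join" banner sets up; the key column 'use_id' is an
# assumption based on the user-usage dataset, not stated in this excerpt:
joined_df = user_devices_df.merge(user_usage_df, on='use_id', how='inner')
print(f"pandas inner join rows: {len(joined_df)}")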
from pycylon import CylonContext
from pycylon import Table
from pycylon.csv import csv_reader

if __name__ == "__main__":
    ctx: CylonContext = CylonContext("mpi")

    rank = ctx.get_rank() + 1
    csv1 = f"/tmp/user_device_tm_{rank}.csv"
    csv2 = f"/tmp/user_usage_tm_{rank}.csv"

    first_table: Table = csv_reader.read(ctx, csv1, ',')
    second_table: Table = csv_reader.read(ctx, csv2, ',')

    print(f"Table 1 & 2 Rows [{first_table.rows},{second_table.rows}], "
          f"Columns [{first_table.columns},{second_table.columns}]")

    joined_table: Table = first_table.distributed_join(ctx,
                                                       table=second_table,
                                                       join_type="inner",
                                                       algorithm="sort",
                                                       left_col=0,
                                                       right_col=3)

    print(f"First table had : {first_table.rows} and "
          f"Second table had : {second_table.rows}, "
          f"Joined has : {joined_table.rows}")

    ctx.finalize()
os.system(f"mkdir -p {base_dir}; rm -f {base_dir}/*.csv") csv1: str = os.path.join(base_dir, f"csv1_{srank}.csv") csv2: str = os.path.join(base_dir, f"csv2_{srank}.csv") src1 = f"{src_dir}/csv1_{srank}.csv" src2 = f"{src_dir}/csv2_{srank}.csv" print("src files ", src1, src2, flush=True) os.system(f"cp {src1} {csv1}") os.system(f"cp {src2} {csv2}") logging.info(f"{srank} Reading tables") table1: Table = csv_reader.read(ctx, csv1, ',') table2: Table = csv_reader.read(ctx, csv2, ',') try: logging.info( f"Table 1 & 2 Rows [{table1.rows},{table2.rows}], Columns [{table1.columns},{table1.columns}]" ) except Exception: raise Exception("Something went wrong in loading tables from disk") logging.info("Inner Join Start") RunJoin(rank=rank, ctx=ctx, table1=table1, table2=table2,
"""
Run test:
>>> python python/test/test_table.py --table_path /tmp/csv.csv
"""

from pycylon.csv import csv_reader
from pycylon import Table
from pycylon import CylonContext
import argparse

ctx: CylonContext = CylonContext(config=None)

parser = argparse.ArgumentParser(description='PyCylon Table')
parser.add_argument('--table_path', type=str, help='Path to table csv')
args = parser.parse_args()

tb1: Table = csv_reader.read(ctx, args.table_path, ',')
print(f"Cylon Table Rows {tb1.rows}, Columns {tb1.columns}")

ctx.finalize()