def dist_merge():
    """Merge two random GPU dataframes in distributed mode and print the result.

    Creates a distributed Cylon environment, builds two five-row dataframes
    with columns ``first`` and ``second`` filled by
    ``cp.random.randint(100, 110, 5)``, performs a left merge on ``first``
    and finalizes the environment.
    """
    mpi_config = cy.MPIConfig()
    env: cy.CylonEnv = cy.CylonEnv(config=mpi_config, distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    left = gcy.DataFrame({
        'first': cp.random.randint(100, 110, 5),
        'second': cp.random.randint(100, 110, 5)
    })
    right = gcy.DataFrame({
        'first': cp.random.randint(100, 110, 5),
        'second': cp.random.randint(100, 110, 5)
    })
    print(left)
    print(right)

    merged = left.merge(
        right=right,
        on="first",
        how="left",
        left_on=None,
        right_on=None,
        left_index=False,
        right_index=False,
        env=env,
    )
    print("distributed joined df:\n", merged)

    env.finalize()
def test_shuffle():
    """Compare distributed shuffle results with the saved per-rank CSV files.

    The cities input of this rank is shuffled once on ``state_id`` and once
    on ``population``. Each shuffled frame is sorted on
    ``state_id``/``city``/``population`` and must equal the equally sorted
    reference file for the same rank.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonContext Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    input_file = "data/input/cities_a_" + rank + ".csv"
    str_shuffle_file = "data/output/shuffle_str_cities_a_" + rank + ".csv"
    int_shuffle_file = "data/output/shuffle_int_cities_a_" + rank + ".csv"

    sort_keys = ["state_id", "city", "population"]

    def _ordered(frame):
        # Sorting makes the comparison independent of row order.
        return frame.sort_values(by=sort_keys, ignore_index=True)

    source = gcy.DataFrame.from_cudf(cudf.read_csv(input_file))

    by_state = source.shuffle(on="state_id", ignore_index=True, env=env)
    actual_str = _ordered(by_state.to_cudf())

    by_population = source.shuffle(on="population", ignore_index=True, env=env)
    actual_int = _ordered(by_population.to_cudf())

    expected_str = _ordered(cudf.read_csv(str_shuffle_file))
    expected_int = _ordered(cudf.read_csv(int_shuffle_file))

    assert actual_str.equals(expected_str), \
        "String based Shuffled DataFrame and DataFrame from file are not equal"
    assert actual_int.equals(expected_int), \
        "Integer based Shuffled DataFrame and DataFrame from file are not equal"
def dist_drop_duplicates():
    """Concatenate two city files, drop duplicates and report the comparison.

    Reads the ``cities_a`` and ``cities_b`` inputs of this rank, concatenates
    them, drops duplicate rows in distributed mode and prints whether the
    sorted result equals the sorted saved union file of this rank.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    first_path = "data/input/cities_a_" + rank + ".csv"
    second_path = "data/input/cities_b_" + rank + ".csv"
    union_path = "data/output/union_cities_" + rank + ".csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))
    print("df1: \n", df1)
    print("df2: \n", df2)

    combined = gcy.concat([df1, df2], env=env)
    deduped = combined.drop_duplicates(ignore_index=True, env=env)

    sort_keys = ["city", "state_id"]
    actual = deduped.to_cudf().sort_values(by=sort_keys, ignore_index=True)
    expected = cudf.read_csv(union_path).sort_values(by=sort_keys,
                                                     ignore_index=True)

    verdict = " equal" if actual.equals(expected) else " not equal"
    print(env.rank, verdict)

    env.finalize()
def test_drop_duplicates():
    """Check that concat followed by drop_duplicates yields the saved union.

    The two input dataframes of this rank are concatenated and duplicates are
    dropped; the sorted result must equal the sorted union file of this rank.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    first_path = "data/input/cities_a_" + rank + ".csv"
    second_path = "data/input/cities_b_" + rank + ".csv"
    union_path = "data/output/union_cities_" + rank + ".csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))

    combined = gcy.concat([df1, df2], env=env)
    deduped = combined.drop_duplicates(ignore_index=True, env=env)

    sort_keys = ["city", "state_id"]
    actual = deduped.to_cudf().sort_values(by=sort_keys, ignore_index=True)
    expected = cudf.read_csv(union_path).sort_values(by=sort_keys,
                                                     ignore_index=True)

    assert actual.equals(expected), \
        "Duplicates dropped DataFrame and the DataFrame from file are not equal"
def test_diff():
    """Check both distributed set differences against the saved CSV files.

    Computes ``df1 - df2`` and ``df2 - df1`` for the city inputs of this rank
    and asserts that each sorted result equals its sorted reference file.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    first_path = "data/input/cities_a_" + rank + ".csv"
    second_path = "data/input/cities_b_" + rank + ".csv"
    first_diff_path = "data/output/diff_df1-df2_" + rank + ".csv"
    second_diff_path = "data/output/diff_df2-df1_" + rank + ".csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))

    first_minus_second = df1.set_difference(other=df2, env=env)
    second_minus_first = df2.set_difference(other=df1, env=env)

    sort_keys = ["city", "state_id"]

    def _ordered(frame):
        # Row order is not part of the comparison, so sort before equals().
        return frame.sort_values(by=sort_keys, ignore_index=True)

    actual_first = _ordered(first_minus_second.to_cudf())
    actual_second = _ordered(second_minus_first.to_cudf())
    expected_first = _ordered(cudf.read_csv(first_diff_path))
    expected_second = _ordered(cudf.read_csv(second_diff_path))

    assert actual_first.equals(expected_first), \
        "First Difference DataFrame and the DataFrame from file are not equal"
    assert actual_second.equals(expected_second), \
        "Second Difference DataFrame and the DataFrame from file are not equal"
def dist_union():
    """Compute the distributed union of two city files and report the check.

    Reads the ``cities_a`` and ``cities_b`` inputs of this rank, takes their
    set union and prints whether the sorted result equals the sorted saved
    union file of this rank.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    first_path = "data/input/cities_a_" + rank + ".csv"
    second_path = "data/input/cities_b_" + rank + ".csv"
    union_path = "data/output/union_cities_" + rank + ".csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))
    print("df1: \n", df1)
    print("df2: \n", df2)

    union_df = df1.set_union(other=df2, env=env)

    # Sort both sides so the comparison ignores row order.
    sort_keys = ["city", "state_id"]
    actual = union_df.to_cudf().sort_values(by=sort_keys, ignore_index=True)
    expected = cudf.read_csv(union_path).sort_values(by=sort_keys,
                                                     ignore_index=True)

    verdict = " equal" if expected.equals(actual) else " not equal"
    print(env.rank, verdict)

    env.finalize()
def dist_diff():
    """Compute both distributed set differences and report each comparison.

    Reads the ``cities_a`` and ``cities_b`` inputs of this rank, computes
    ``df1 - df2`` and ``df2 - df1`` and prints, for each, whether the sorted
    result equals the sorted saved difference file of this rank.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    first_path = "data/input/cities_a_" + rank + ".csv"
    second_path = "data/input/cities_b_" + rank + ".csv"
    first_diff_path = "data/output/diff_df1-df2_" + rank + ".csv"
    second_diff_path = "data/output/diff_df2-df1_" + rank + ".csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))
    print("df1: \n", df1)
    print("df2: \n", df2)

    first_minus_second = df1.set_difference(other=df2, env=env)
    second_minus_first = df2.set_difference(other=df1, env=env)

    # Sort both sides so the comparison ignores row order.
    sort_keys = ["city", "state_id"]
    actual_first = first_minus_second.to_cudf().sort_values(
        by=sort_keys, ignore_index=True)
    actual_second = second_minus_first.to_cudf().sort_values(
        by=sort_keys, ignore_index=True)
    expected_first = cudf.read_csv(first_diff_path).sort_values(
        by=sort_keys, ignore_index=True)
    expected_second = cudf.read_csv(second_diff_path).sort_values(
        by=sort_keys, ignore_index=True)

    first_verdict = " equal" if expected_first.equals(actual_first) else " not equal"
    print(env.rank, first_verdict)
    second_verdict = " equal" if expected_second.equals(actual_second) else " not equal"
    print(env.rank, second_verdict)

    env.finalize()
def drop_cuplicates():
    """Drop duplicate rows of a random 20-row dataframe and print the outcome.

    Builds a dataframe with columns ``first`` and ``second`` filled by
    ``cp.random.randint(100, 110, 20)``, runs a distributed
    ``drop_duplicates`` and prints the result, or the input frame when the
    result evaluates as false.
    """
    mpi_config = cy.MPIConfig()
    env: cy.CylonEnv = cy.CylonEnv(config=mpi_config, distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    source = gcy.DataFrame({
        'first': cp.random.randint(100, 110, 20),
        'second': cp.random.randint(100, 110, 20)
    })
    print("df1: \n", source)

    deduped = source.drop_duplicates(ignore_index=True, env=env)

    # NOTE(review): this relies on the truthiness of the returned dataframe
    # object; presumably an `is not None` check was intended — confirm.
    shown = deduped if deduped else source
    print("duplicates dropped: \n", shown)

    env.finalize()
def gen_join_test_data():
    """Write the inner-join result of this rank's city files to a CSV file.

    Both inputs are read with ``state_id`` as index column, joined in
    distributed mode with ``how="inner"`` and saved to the per-rank join
    output file.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    left_path = "data/input/cities_a_" + rank + ".csv"
    right_path = "data/input/cities_b_" + rank + ".csv"
    join_file = "data/output/join_cities_" + rank + ".csv"

    left_cudf = cudf.read_csv(left_path, index_col="state_id")
    left = gcy.DataFrame.from_cudf(left_cudf)
    right_cudf = cudf.read_csv(right_path, index_col="state_id")
    right = gcy.DataFrame.from_cudf(right_cudf)

    joined = left.join(other=right, how="inner", env=env)
    joined.to_cudf().to_csv(join_file)
    print(env.rank, " written join_file to the file: ", join_file)

    env.finalize()
def dist_join():
    """Join two random GPU dataframes in distributed mode and print the result.

    Each dataframe has ten rows with columns ``first`` and ``second`` filled
    by ``cp.random.rand(10)``; the join uses the default arguments of
    ``DataFrame.join`` apart from ``env``.
    """
    mpi_config = cy.MPIConfig()
    env: cy.CylonEnv = cy.CylonEnv(config=mpi_config, distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    left = gcy.DataFrame({
        'first': cp.random.rand(10),
        'second': cp.random.rand(10)
    })
    right = gcy.DataFrame({
        'first': cp.random.rand(10),
        'second': cp.random.rand(10)
    })
    print(left)
    print(right)

    joined = left.join(other=right, env=env)
    print("distributed joined df:\n", joined)

    env.finalize()
def dist_concat():
    """Concatenate two random GPU dataframes in distributed mode and print it.

    The two five-row frames hold the same columns (``first``, ``second``) but
    declare them in opposite order; they are concatenated with
    ``join="inner"``.
    """
    mpi_config = cy.MPIConfig()
    env: cy.CylonEnv = cy.CylonEnv(config=mpi_config, distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    top = gcy.DataFrame({
        'first': cp.random.randint(0, 10, 5),
        'second': cp.random.randint(100, 110, 5)
    })
    # Column order is deliberately kept as 'second' then 'first' here.
    bottom = gcy.DataFrame({
        'second': cp.random.randint(100, 110, 5),
        'first': cp.random.randint(0, 10, 5)
    })
    print(top)
    print(bottom)

    stacked = gcy.concat([top, bottom], join="inner", env=env)
    print("distributed concated df:\n", stacked)

    env.finalize()
def gen_union_files():
    """Write the distributed union of this rank's city files to a CSV file.

    Reads the ``cities_a`` and ``cities_b`` inputs of this rank, computes
    their set union and saves it without the index to the per-rank union
    output file.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    first_path = "data/input/cities_a_" + rank + ".csv"
    second_path = "data/input/cities_b_" + rank + ".csv"
    union_path = "data/output/union_cities_" + rank + ".csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))
    print("df1: \n", df1)
    print("df2: \n", df2)

    union_df = df1.set_union(other=df2, env=env)
    union_cudf = union_df.to_cudf()
    union_cudf.to_csv(union_path, index=False)
    print(env.rank, " written unionFile to the file: ", union_path)

    env.finalize()
def gen_concat_files():
    """Write the distributed concatenation of this rank's city files to CSV.

    Reads the ``cities_a`` and ``cities_b`` inputs of this rank, concatenates
    them and saves the result without the index to the per-rank concat
    output file.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    first_path = "data/input/cities_a_" + rank + ".csv"
    second_path = "data/input/cities_b_" + rank + ".csv"
    concat_path = "data/output/concat_cities_" + rank + ".csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))
    print("df1: \n", df1)
    print("df2: \n", df2)

    combined = gcy.concat([df1, df2], env=env)
    combined_cudf = combined.to_cudf()
    combined_cudf.to_csv(concat_path, index=False)
    print(env.rank, " written concatFile to the file: ", concat_path)

    env.finalize()
def gen_groupby_test_data():
    """Write groupby sum and max results of this rank's city file to CSV.

    Only the ``state_id`` and ``population`` columns are kept; the frame is
    grouped by ``state_id`` in distributed mode and the ``sum`` and ``max``
    aggregations are saved to their per-rank output files.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    input_path = "data/input/cities_a_" + rank + ".csv"
    sum_path = "data/output/groupby_sum_cities_a_" + rank + ".csv"
    max_path = "data/output/groupby_max_cities_a_" + rank + ".csv"

    cities = gcy.DataFrame.from_cudf(cudf.read_csv(input_path))
    df1 = cities[["state_id", "population"]]
    print("df1: \n", df1)

    grouped = df1.groupby("state_id", env=env)
    sums = grouped.sum()
    sums.to_csv(sum_path)
    maxima = grouped.max()
    maxima.to_csv(max_path)

    print(env.rank, " written gbyFile1 to the file: ", sum_path)
    print(env.rank, " written gbyFile2 to the file: ", max_path)

    env.finalize()
def dist_groupby():
    """Demonstrate distributed groupby operations on small in-memory frames.

    Rank 0 uses one three-row frame and every other rank uses a second one.
    The function then prints the results of several groupby variants:
    grouping on one column, on two columns, on an index level, and on a
    projected two-column frame. The environment is finalized at the end.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    if env.rank == 0:
        df = gcy.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 3], 'c': [1, 3, 5]})
    else:
        # Every non-zero rank gets this frame. Previously only rank 1 did, so
        # running with more than two workers left `df` unassigned on the extra
        # ranks and they failed with UnboundLocalError at the first groupby.
        df = gcy.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 4], 'c': [2, 4, 6]})
    # Same text as the former per-rank messages for ranks 0 and 1.
    print(f"df on rank {env.rank}: \n", df)

    gby = df.groupby("a", env=env)
    print("df grouped-by on column 'a', performed 'sum': \n", gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())
    print(
        "performed 'sum' on the same groupby object, aggregated on the column 'b' only: \n",
        gby["b"].sum())
    print("performed 'mean' on the same groupby object: \n", gby.mean())
    print("sizes of each group: \n", gby.size())

    gby = df.groupby(["a", "b"], env=env)
    print("df grouped-by on columns a and b, performed 'sum': \n", gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())

    # groupby on index column with "level" parameter
    df1 = df.set_index("a")
    gby = df1.groupby(level="a", env=env)
    print("df grouped-by on index 'a', performed 'sum': \n", gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())

    # if the original dataframe has many columns and
    # we only want to perform the groupby on some columns only,
    # the best way is to create a new dataframe with a subset of columns and
    # perform the groupby on this new dataframe
    df2 = df[["a", "b"]]
    print("two columns projected dataframe:\n", df2)
    gby = df2.groupby("a", env=env)
    print("grouped-by on column 'a' of projected df, performed 'sum': \n",
          gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())

    env.finalize()
def test_intersect():
    """Check the distributed set intersection against the saved CSV file.

    Intersects the ``cities_a`` and ``cities_b`` inputs of this rank and
    asserts that the sorted result equals the sorted reference file.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    first_path = "data/input/cities_a_" + rank + ".csv"
    second_path = "data/input/cities_b_" + rank + ".csv"
    intersect_path = "data/output/intersect_cities_" + rank + ".csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))

    common = df1.set_intersect(other=df2, env=env)

    sort_keys = ["city", "state_id"]
    actual = common.to_cudf().sort_values(by=sort_keys, ignore_index=True)
    expected = cudf.read_csv(intersect_path).sort_values(by=sort_keys,
                                                         ignore_index=True)

    assert actual.equals(expected), \
        "Intersect DataFrame and the DataFrame from file are not equal"
def dist_union():
    """Print distributed set unions of two small rank-dependent dataframes.

    Both frames hold ``weight`` and ``age`` columns (declared in opposite
    order) whose weights are offset by the rank. The union is printed once
    with default arguments and once with ``keep_duplicates=True`` and
    ``ignore_index=True``.
    """
    mpi_config = cy.MPIConfig()
    env: cy.CylonEnv = cy.CylonEnv(config=mpi_config, distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    left = gcy.DataFrame({
        'weight': [60 + env.rank, 80 + env.rank],
        'age': [44, 55],
    })
    right = gcy.DataFrame({
        'age': [44, 66],
        'weight': [60 + env.rank, 80 + env.rank],
    })
    print(left)
    print(right)

    plain_union = left.set_union(other=right, env=env)
    print("distributed set union:\n", plain_union)

    union_with_duplicates = left.set_union(other=right,
                                           keep_duplicates=True,
                                           ignore_index=True,
                                           env=env)
    print("distributed set union with duplicates:\n", union_with_duplicates)

    env.finalize()
def dist_intersection():
    """Print distributed set intersections of two small dataframes.

    Both frames hold ``weight`` and ``age`` columns whose weights are offset
    by the rank. The intersection is printed once over all columns and once
    restricted to ``subset=["age"]``.
    """
    mpi_config = cy.MPIConfig()
    env: cy.CylonEnv = cy.CylonEnv(config=mpi_config, distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    left = gcy.DataFrame({
        'weight': [60 + env.rank, 80 + env.rank],
        'age': [44, 55],
    })
    right = gcy.DataFrame({
        'weight': [60 + env.rank, 80 + env.rank],
        'age': [44, 66],
    })
    print(left)
    print(right)

    full_intersection = left.set_intersect(other=right, env=env)
    print("distributed set intersection:\n", full_intersection)

    age_intersection = left.set_intersect(other=right, subset=["age"], env=env)
    print("distributed set intersection with a subset of columns:\n",
          age_intersection)

    env.finalize()
def test_concat():
    """Check the distributed concatenation against the saved CSV file.

    Concatenates the ``cities_a`` and ``cities_b`` inputs of this rank and
    asserts that the sorted result equals the sorted reference file.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    first_path = "data/input/cities_a_" + rank + ".csv"
    second_path = "data/input/cities_b_" + rank + ".csv"
    concat_path = "data/output/concat_cities_" + rank + ".csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))

    combined = gcy.concat([df1, df2], env=env)

    sort_keys = ["city", "state_id"]
    actual = combined.to_cudf().sort_values(by=sort_keys, ignore_index=True)
    expected = cudf.read_csv(concat_path).sort_values(by=sort_keys,
                                                      ignore_index=True)

    assert actual.equals(expected), \
        "Concatanated DataFrame and the DataFrame from file are not equal"
def gen_shuffle_test_data():
    """Write the shuffle reference files for this rank's city input.

    The input is shuffled once on ``state_id`` and once on ``population`` in
    distributed mode; each result is saved without the index to its own
    per-rank output file and the written paths are reported.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    input_file = "data/input/cities_a_" + rank + ".csv"
    str_shuffle_file = "data/output/shuffle_str_cities_a_" + rank + ".csv"
    int_shuffle_file = "data/output/shuffle_int_cities_a_" + rank + ".csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(input_file))

    str_shuffled = df1.shuffle(on="state_id", ignore_index=True, env=env)
    str_shuffled.to_cudf().to_csv(str_shuffle_file, index=False)

    int_shuffled = df1.shuffle(on="population", ignore_index=True, env=env)
    int_shuffled.to_cudf().to_csv(int_shuffle_file, index=False)

    # The messages used to say "gbyFile1"/"gbyFile2" (copied from the groupby
    # generator) although the shuffle files are what gets written here.
    print(env.rank, " written str_shuffle_file to the file: ", str_shuffle_file)
    print(env.rank, " written int_shuffle_file to the file: ", int_shuffle_file)

    env.finalize()
def gen_diff_files():
    """Write both distributed set differences of this rank's city files to CSV.

    Computes ``df1 - df2`` and ``df2 - df1`` for the ``cities_a`` and
    ``cities_b`` inputs of this rank and saves each result without the index
    to its per-rank output file.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    first_path = "data/input/cities_a_" + rank + ".csv"
    second_path = "data/input/cities_b_" + rank + ".csv"
    first_diff_path = "data/output/diff_df1-df2_" + rank + ".csv"
    second_diff_path = "data/output/diff_df2-df1_" + rank + ".csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))
    print("df1: \n", df1)
    print("df2: \n", df2)

    first_minus_second = df1.set_difference(other=df2, env=env)
    second_minus_first = df2.set_difference(other=df1, env=env)

    first_minus_second.to_cudf().to_csv(first_diff_path, index=False)
    second_minus_first.to_cudf().to_csv(second_diff_path, index=False)

    print(env.rank, " written diff1 to the file: ", first_diff_path)
    print(env.rank, " written diff2 to the file: ", second_diff_path)

    env.finalize()
def test_groupby():
    """Check distributed groupby sum and max against the saved CSV files.

    Keeps the ``state_id`` and ``population`` columns of this rank's city
    input, groups by ``state_id`` and asserts that the index-sorted ``sum``
    and ``max`` results equal the index-sorted reference files.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonContext Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    inputFile = "data/input/cities_a_" + rank + ".csv"
    gbyFile1 = "data/output/groupby_sum_cities_a_" + rank + ".csv"
    gbyFile2 = "data/output/groupby_max_cities_a_" + rank + ".csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(inputFile))
    df1 = df1[["state_id", "population"]]

    gby = df1.groupby("state_id", env=env)
    sum_df = gby.sum().sort_index()
    max_df = gby.max().sort_index()

    saved_sum_df = cudf.read_csv(gbyFile1, index_col="state_id").sort_index()
    saved_max_df = cudf.read_csv(gbyFile2, index_col="state_id").sort_index()

    assert sum_df.equals(
        saved_sum_df
    ), "Groupbyed Sum DataFrame and DataFrame from file are not equal"
    # Message previously read "Maz", which mislabelled the failing aggregation.
    assert max_df.equals(
        saved_max_df
    ), "Groupbyed Max DataFrame and DataFrame from file are not equal"
def dist_diff():
    """Print distributed set differences of two small rank-dependent frames.

    Both frames hold ``weight`` and ``age`` columns whose weights are offset
    by the rank. Prints ``df1 - df2``, ``df2 - df1`` and ``df2 - df1``
    restricted to ``subset=["age"]``.
    """
    mpi_config = cy.MPIConfig()
    env: cy.CylonEnv = cy.CylonEnv(config=mpi_config, distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    df1 = gcy.DataFrame({
        'weight': [60 + env.rank, 80 + env.rank],
        'age': [44, 55],
    })
    df2 = gcy.DataFrame({
        'weight': [60 + env.rank, 80 + env.rank],
        'age': [44, 66],
    })
    print("df1: \n", df1)
    print("df2: \n", df2)

    first_minus_second = df1.set_difference(other=df2, env=env)
    print("df1 distributed set difference df2:\n", first_minus_second)

    second_minus_first = df2.set_difference(other=df1, env=env)
    print("df2 distributed set difference df1:\n", second_minus_first)

    # The df1 - df2 variant on subset=["age"] is currently disabled:
    # df3 = df1.set_difference(df2, subset=["age"], env=env)
    # print("df1 distributed set difference df2 on subset=['age']: \n", df3)

    second_minus_first_on_age = df2.set_difference(df1, subset=["age"], env=env)
    print("df2 distributed set difference df1 on subset=['age']: \n",
          second_minus_first_on_age)

    env.finalize()
def test_join():
    """Check the distributed inner join against the saved CSV file.

    Both city inputs of this rank are read with ``state_id`` as index column
    and joined with ``how="inner"``. The result and the reference file are
    sorted on the same four columns and must have equal length and content.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonContext Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    left_path = "data/input/cities_a_" + rank + ".csv"
    right_path = "data/input/cities_b_" + rank + ".csv"
    join_file = "data/output/join_cities_" + rank + ".csv"

    left = gcy.DataFrame.from_cudf(
        cudf.read_csv(left_path, index_col="state_id"))
    right = gcy.DataFrame.from_cudf(
        cudf.read_csv(right_path, index_col="state_id"))

    joined = left.join(other=right, how="inner", env=env)

    sort_keys = ["cityl", "populationl", "cityr", "populationr"]
    actual = joined.to_cudf().sort_values(by=sort_keys)
    expected = cudf.read_csv(join_file, index_col="state_id")
    expected = expected.sort_values(by=sort_keys)

    assert len(actual) == len(expected)
    assert actual.equals(expected), \
        "Joined DataFrame and DataFrame from file are not equal"
def dist_concat():
    """Concatenate this rank's city files and report the comparison result.

    Reads the ``cities_a`` and ``cities_b`` inputs of this rank, concatenates
    them in distributed mode and prints whether the sorted result equals the
    sorted saved concat file of this rank.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    rank = str(env.rank)
    first_path = "data/input/cities_a_" + rank + ".csv"
    second_path = "data/input/cities_b_" + rank + ".csv"
    concat_path = "data/output/concat_cities_" + rank + ".csv"

    df1 = gcy.DataFrame.from_cudf(cudf.read_csv(first_path))
    df2 = gcy.DataFrame.from_cudf(cudf.read_csv(second_path))

    combined = gcy.concat([df1, df2], env=env)

    # Sort both sides so the comparison ignores row order.
    sort_keys = ["city", "state_id"]
    actual = combined.to_cudf().sort_values(by=sort_keys, ignore_index=True)
    expected = cudf.read_csv(concat_path).sort_values(by=sort_keys,
                                                      ignore_index=True)

    verdict = " equal" if actual.equals(expected) else " not equal"
    print(env.rank, verdict)

    env.finalize()
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ## import cupy as cp import pycylon as cy import pygcylon as gcy env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True) print("CylonContext Initialized: My rank: ", env.rank) start = 100 * env.rank df = gcy.DataFrame({ 'first': cp.random.randint(start, start + 10, 10), 'second': cp.random.randint(start, start + 10, 10) }) print("initial df from rank: ", env.rank, "\n", df) shuffledDF = df.shuffle(on="first", ignore_index=True, env=env) print("shuffled df from rank: ", env.rank, "\n", shuffledDF) env.finalize() print("after finalize from the rank:", env.rank)