def dist_merge():
    """Demonstrate a distributed left merge of two random frames on 'first'.

    Initializes a distributed CylonEnv over MPI, builds two 5-row frames
    with columns 'first' and 'second', prints both, merges them with
    ``how="left"`` through the environment, prints the merged frame and
    finalizes the environment.
    """
    mpi_env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", mpi_env.rank)

    def random_frame():
        # Both inputs share the same layout; 'first' is drawn before 'second'.
        return gcy.DataFrame({
            'first': cp.random.randint(100, 110, 5),
            'second': cp.random.randint(100, 110, 5),
        })

    left = random_frame()
    right = random_frame()
    print(left)
    print(right)

    merged = left.merge(
        right=right,
        how="left",
        on="first",
        left_on=None,
        right_on=None,
        left_index=False,
        right_index=False,
        env=mpi_env,
    )
    print("distributed joined df:\n", merged)
    mpi_env.finalize()
def local_join():
    """Demonstrate a local (non-distributed) join of two random frames.

    Builds two 10-row frames with columns 'first' and 'second' filled by
    ``cp.random.rand``, prints them, joins the first with the second using
    the default join arguments and prints the result.
    """
    def random_frame():
        # 'first' is generated before 'second' for each frame.
        return gcy.DataFrame({
            'first': cp.random.rand(10),
            'second': cp.random.rand(10),
        })

    left = random_frame()
    right = random_frame()
    print("df1: \n", left)
    print("df2: \n", right)

    joined = left.join(right)
    print("locally joined df: \n", joined)
def local_concat():
    """Demonstrate a local (non-distributed) concatenation of two frames.

    Builds two 2-row frames that hold the same columns ('name', 'age') in
    a different order, prints them, concatenates them and prints the
    result.
    """
    df1 = gcy.DataFrame({
        'name': ["John", "Smith"],
        'age': [44, 55],
    })
    df2 = gcy.DataFrame({
        'age': [44, 66],
        'name': ["John", "Joseph"],
    })
    print("df1: \n", df1)
    print("df2: \n", df2)

    # The frames are handed over as a single list, the same call shape that
    # dist_concat uses; passing them as two positional arguments would bind
    # df2 to whatever parameter follows the frame list.
    df3 = gcy.concat([df1, df2])
    print("locally concatenated df: \n", df3)
def local_union():
    """Demonstrate a local set union of two small frames.

    Prints both inputs, then prints their set union twice: once with the
    default arguments and once with ``keep_duplicates=True``.
    """
    people_a = gcy.DataFrame({
        'name': ["John", "Smith"],
        'age': [44, 55],
    })
    # Same columns as the first frame, declared in the opposite order.
    people_b = gcy.DataFrame({
        'age': [44, 66],
        'name': ["John", "Joseph"],
    })
    print("df1: \n", people_a)
    print("df2: \n", people_b)

    distinct_union = people_a.set_union(people_b)
    print("set union: \n", distinct_union)

    full_union = people_a.set_union(people_b, keep_duplicates=True)
    print("set union with duplicates: \n", full_union)
def local_intersection():
    """Demonstrate a local set intersection of two small frames.

    Prints both inputs, then prints their intersection twice: once with
    the default arguments and once with ``subset=["age"]``.
    """
    people_a = gcy.DataFrame({
        'name': ["John", "Smith"],
        'age': [44, 55],
    })
    # Same columns as the first frame, declared in the opposite order.
    people_b = gcy.DataFrame({
        'age': [44, 66],
        'name': ["John", "Joseph"],
    })
    print("df1: \n", people_a)
    print("df2: \n", people_b)

    common_rows = people_a.set_intersect(people_b)
    print("set intersect: \n", common_rows)

    common_by_age = people_a.set_intersect(people_b, subset=["age"])
    print("set intersect with subset: \n", common_by_age)
def local_groupby():
    """Demonstrate local groupby operations on a small three-column frame.

    Groups by column 'a', prints several aggregations (sum, max, per-column
    sum, mean), iterates over the groups and over the group indices, then
    repeats sum and max for a groupby on the two columns 'a' and 'b'.
    """
    frame = gcy.DataFrame({
        'a': [1, 1, 1, 2, 2],
        'b': [1, 1, 2, 2, 3],
        'c': [1, 2, 3, 4, 5],
    })
    print("df: \n", frame)

    by_a = frame.groupby("a")
    print("df grouped-by on column 'a', performed 'sum': \n", by_a.sum())
    print("performed 'max' on the same groupby object: \n", by_a.max())

    # Aggregations restricted to one value column at a time.
    b_sum = by_a["b"].sum()
    print(
        "performed 'sum' on the same groupby object, aggregated on the column 'b' only: \n",
        b_sum)
    c_sum = by_a["c"].sum()
    print(
        "performed 'sum' on the same groupby object, aggregated on the column 'c' only: \n",
        c_sum)

    print("performed 'mean' on the same groupby object: \n", by_a.mean())

    print("iterate through the groups: ")
    for group_name, group_frame in by_a:
        print("group: ", group_name)
        print(group_frame)

    print("iterate through the group indices: ")
    for group_key in by_a.groups:
        print("group key: ", group_key)
        print(by_a.groups[group_key])

    by_a_and_b = frame.groupby(["a", "b"])
    print("df grouped-by on columns 'a' and 'b', performed 'sum': \n", by_a_and_b.sum())
    print("performed 'max' on the same groupby object: \n", by_a_and_b.max())
def local_merge():
    """Demonstrate a local (non-distributed) left merge on column 'first'.

    Builds two 5-row random integer frames with columns 'first' and
    'second', prints them, merges them with ``how="left"`` and prints the
    merged frame.
    """
    def random_frame():
        # 'first' is generated before 'second' for each frame.
        return gcy.DataFrame({
            'first': cp.random.randint(100, 110, 5),
            'second': cp.random.randint(100, 110, 5),
        })

    left = random_frame()
    right = random_frame()
    print("df1: \n", left)
    print("df2: \n", right)

    merged = left.merge(
        right=right,
        on="first",
        how="left",
        left_index=False,
        right_index=False,
    )
    print("locally merged df: \n", merged)
def dist_concat():
    """Demonstrate a distributed inner concat of two random frames.

    Initializes a distributed CylonEnv over MPI, builds two 5-row frames
    holding the columns 'first' and 'second' in opposite order, prints
    them, concatenates them with ``join="inner"`` through the environment,
    prints the result and finalizes the environment.
    """
    mpi_env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", mpi_env.rank)

    first_frame = gcy.DataFrame({
        'first': cp.random.randint(0, 10, 5),
        'second': cp.random.randint(100, 110, 5),
    })
    # Column order is deliberately reversed relative to first_frame.
    second_frame = gcy.DataFrame({
        'second': cp.random.randint(100, 110, 5),
        'first': cp.random.randint(0, 10, 5),
    })
    print(first_frame)
    print(second_frame)

    frames = [first_frame, second_frame]
    combined = gcy.concat(frames, join="inner", env=mpi_env)
    print("distributed concated df:\n", combined)
    mpi_env.finalize()
def dist_join():
    """Demonstrate a distributed join of two random frames.

    Initializes a distributed CylonEnv over MPI, builds two 10-row frames
    filled by ``cp.random.rand``, prints them, joins them through the
    environment with the default join arguments, prints the result and
    finalizes the environment.
    """
    mpi_env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", mpi_env.rank)

    def random_frame():
        # 'first' is generated before 'second' for each frame.
        return gcy.DataFrame({
            'first': cp.random.rand(10),
            'second': cp.random.rand(10),
        })

    left = random_frame()
    right = random_frame()
    print(left)
    print(right)

    joined = left.join(other=right, env=mpi_env)
    print("distributed joined df:\n", joined)
    mpi_env.finalize()
def local_diff():
    """Demonstrate local set differences between two small frames.

    Prints both inputs, then prints four set differences: each direction
    over all columns, and each direction restricted to ``subset=["name"]``.
    """
    people_a = gcy.DataFrame({
        'name': ["John", "Smith"],
        'age': [44, 55],
    })
    # Same columns as the first frame, declared in the opposite order.
    people_b = gcy.DataFrame({
        'age': [44, 66],
        'name': ["John", "Joseph"],
    })
    print("df1: \n", people_a)
    print("df2: \n", people_b)

    # Differences computed over all columns, in both directions.
    a_minus_b = people_a.set_difference(people_b)
    print("df1 set difference df2: \n", a_minus_b)
    b_minus_a = people_b.set_difference(people_a)
    print("df2 set difference df1: \n", b_minus_a)

    # The same two differences, restricted to the 'name' column.
    a_minus_b_by_name = people_a.set_difference(people_b, subset=["name"])
    print("df1 set difference df2 on subset=['name']: \n", a_minus_b_by_name)
    b_minus_a_by_name = people_b.set_difference(people_a, subset=["name"])
    print("df2 set difference df1 on subset=['name']: \n", b_minus_a_by_name)
def dist_groupby():
    """Demonstrate distributed groupby operations across MPI ranks.

    Initializes a distributed CylonEnv, builds a small per-rank frame with
    columns 'a', 'b' and 'c', and prints several aggregations for:

    * a groupby on column 'a',
    * a groupby on columns 'a' and 'b',
    * a groupby on the index (``level="a"``) after ``set_index("a")``,
    * a groupby on a two-column projection of the frame.

    Rank 0 uses one data set and every other rank uses a second one, so
    the example also runs with more than two ranks. The environment is
    finalized at the end.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    if env.rank == 0:
        df = gcy.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 3], 'c': [1, 3, 5]})
        print("df on rank 0: \n", df)
    else:
        # Every non-zero rank gets a frame. Previously only rank 1 did, so
        # a run with three or more ranks failed on the first use of `df`.
        df = gcy.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 4], 'c': [2, 4, 6]})
        print(f"df on rank {env.rank}: \n", df)

    gby = df.groupby("a", env=env)
    print("df grouped-by on column 'a', performed 'sum': \n", gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())
    print(
        "performed 'sum' on the same groupby object, aggregated on the column 'b' only: \n",
        gby["b"].sum())
    print("performed 'mean' on the same groupby object: \n", gby.mean())
    print("sizes of each group: \n", gby.size())

    gby = df.groupby(["a", "b"], env=env)
    print("df grouped-by on columns a and b, performed 'sum': \n", gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())

    # groupby on index column with "level" parameter
    df1 = df.set_index("a")
    gby = df1.groupby(level="a", env=env)
    print("df grouped-by on index 'a', performed 'sum': \n", gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())

    # if the original dataframe has many columns and
    # we only want to perform the groupby on some columns only,
    # the best way is to create a new dataframe with a subset of columns and
    # perform the groupby on this new dataframe
    df2 = df[["a", "b"]]
    print("two columns projected dataframe:\n", df2)
    gby = df2.groupby("a", env=env)
    print("grouped-by on column 'a' of projected df, performed 'sum': \n", gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())

    env.finalize()
def dist_intersection():
    """Demonstrate a distributed set intersection of two small frames.

    Initializes a distributed CylonEnv over MPI and builds two 2-row
    frames whose 'weight' values are offset by the rank. Prints their
    intersection over all columns and then with ``subset=["age"]``, and
    finalizes the environment.
    """
    mpi_env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", mpi_env.rank)

    frame_a = gcy.DataFrame({
        'weight': [base + mpi_env.rank for base in (60, 80)],
        'age': [44, 55],
    })
    frame_b = gcy.DataFrame({
        'weight': [base + mpi_env.rank for base in (60, 80)],
        'age': [44, 66],
    })
    print(frame_a)
    print(frame_b)

    common_rows = frame_a.set_intersect(other=frame_b, env=mpi_env)
    print("distributed set intersection:\n", common_rows)

    common_by_age = frame_a.set_intersect(other=frame_b, subset=["age"], env=mpi_env)
    print("distributed set intersection with a subset of columns:\n", common_by_age)
    mpi_env.finalize()
def set_index():
    """Demonstrate setting and resetting the index of a local frame.

    Builds a 20-row random integer frame with columns 'first' and
    'second', prints it, prints the frame returned by
    ``set_index("first")`` and then the frame returned by calling
    ``reset_index()`` on that result.
    """
    original = gcy.DataFrame({
        'first': cp.random.randint(100, 110, 20),
        'second': cp.random.randint(100, 110, 20),
    })
    print("df1: \n", original)

    indexed = original.set_index("first")
    # The label and the frame are printed by two separate calls.
    print("set index to first: \n")
    print(indexed)

    restored = indexed.reset_index()
    print("index reset: \n", restored)
def dist_union():
    """Demonstrate a distributed set union of two small frames.

    Initializes a distributed CylonEnv over MPI and builds two 2-row
    frames whose 'weight' values are offset by the rank. Prints their
    union with the default arguments and then with
    ``keep_duplicates=True, ignore_index=True``, and finalizes the
    environment.
    """
    mpi_env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", mpi_env.rank)

    frame_a = gcy.DataFrame({
        'weight': [base + mpi_env.rank for base in (60, 80)],
        'age': [44, 55],
    })
    # Same columns as frame_a, declared in the opposite order.
    frame_b = gcy.DataFrame({
        'age': [44, 66],
        'weight': [base + mpi_env.rank for base in (60, 80)],
    })
    print(frame_a)
    print(frame_b)

    distinct_union = frame_a.set_union(other=frame_b, env=mpi_env)
    print("distributed set union:\n", distinct_union)

    full_union = frame_a.set_union(
        other=frame_b,
        keep_duplicates=True,
        ignore_index=True,
        env=mpi_env,
    )
    print("distributed set union with duplicates:\n", full_union)
    mpi_env.finalize()
def drop_cuplicates():
    """Demonstrate distributed duplicate removal on a random frame.

    Initializes a distributed CylonEnv over MPI, builds a 20-row random
    integer frame with columns 'first' and 'second', prints it, calls
    ``drop_duplicates(ignore_index=True, env=env)``, prints the outcome
    and finalizes the environment.

    The function name keeps its historical spelling so existing callers
    continue to work.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    df1 = gcy.DataFrame({
        'first': cp.random.randint(100, 110, 20),
        'second': cp.random.randint(100, 110, 20)
    })
    print("df1: \n", df1)

    df2 = df1.drop_duplicates(ignore_index=True, env=env)

    # Test for None explicitly instead of relying on the truthiness of a
    # frame: an empty result must still be reported as the result rather
    # than being replaced by the input frame.
    # NOTE(review): presumably None is only returned when the call works
    # in place - confirm against the pygcylon API.
    if df2 is not None:
        print("duplicates dropped: \n", df2)
    else:
        print("duplicates dropped: \n", df1)

    env.finalize()
def dist_diff():
    """Demonstrate distributed set differences between two small frames.

    Initializes a distributed CylonEnv over MPI and builds two 2-row
    frames whose 'weight' values are offset by the rank. Prints the set
    difference in both directions over all columns, then the df2-minus-df1
    difference with ``subset=["age"]``, and finalizes the environment.
    """
    mpi_env = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", mpi_env.rank)

    frame_a = gcy.DataFrame({
        'weight': [base + mpi_env.rank for base in (60, 80)],
        'age': [44, 55],
    })
    frame_b = gcy.DataFrame({
        'weight': [base + mpi_env.rank for base in (60, 80)],
        'age': [44, 66],
    })
    print("df1: \n", frame_a)
    print("df2: \n", frame_b)

    a_minus_b = frame_a.set_difference(other=frame_b, env=mpi_env)
    print("df1 distributed set difference df2:\n", a_minus_b)

    b_minus_a = frame_b.set_difference(other=frame_a, env=mpi_env)
    print("df2 distributed set difference df1:\n", b_minus_a)

    # NOTE(review): the df1-minus-df2 subset variant was already disabled
    # in the original; the reason is not recorded here.
    # df3 = df1.set_difference(df2, subset=["age"], env=env)
    # print("df1 distributed set difference df2 on subset=['age']: \n", df3)

    b_minus_a_by_age = frame_b.set_difference(frame_a, subset=["age"], env=mpi_env)
    print("df2 distributed set difference df1 on subset=['age']: \n", b_minus_a_by_age)
    mpi_env.finalize()
# # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ## import cupy as cp import pycylon as cy import pygcylon as gcy env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True) print("CylonContext Initialized: My rank: ", env.rank) start = 100 * env.rank df = gcy.DataFrame({ 'first': cp.random.randint(start, start + 10, 10), 'second': cp.random.randint(start, start + 10, 10) }) print("initial df from rank: ", env.rank, "\n", df) shuffledDF = df.shuffle(on="first", ignore_index=True, env=env) print("shuffled df from rank: ", env.rank, "\n", shuffledDF) env.finalize() print("after finalize from the rank:", env.rank)