Esempio n. 1
0
def dist_merge():
    """Left-merge two random integer dataframes on 'first' with a CylonEnv.

    Starts a distributed CylonEnv over MPI, builds two 5-row dataframes,
    prints them, merges them with ``env`` handed to ``merge`` and prints the
    merged dataframe. The environment is finalized before returning.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    def _random_frame():
        # Two integer columns, five random values each.
        return gcy.DataFrame({
            'first': cp.random.randint(100, 110, 5),
            'second': cp.random.randint(100, 110, 5),
        })

    left = _random_frame()
    right = _random_frame()
    print(left)
    print(right)

    # Every option of the original call is spelled out, including the ones
    # that are passed as None/False.
    merge_options = dict(
        right=right,
        on="first",
        how="left",
        left_on=None,
        right_on=None,
        left_index=False,
        right_index=False,
        env=env,
    )
    merged = left.merge(**merge_options)
    print("distributed joined df:\n", merged)
    env.finalize()
Esempio n. 2
0
def local_join():
    """Join two 10-row random dataframes without a CylonEnv and print them."""

    def _random_frame():
        # Columns 'first' and 'second', ten cp.random.rand values each.
        return gcy.DataFrame({
            'first': cp.random.rand(10),
            'second': cp.random.rand(10),
        })

    left, right = _random_frame(), _random_frame()
    print("df1: \n", left)
    print("df2: \n", right)

    joined = left.join(right)
    print("locally joined df: \n", joined)
Esempio n. 3
0
def local_concat():
    """Concatenate two small dataframes without a CylonEnv and print them.

    Both inputs carry the columns 'name' and 'age', declared in a different
    order in each dataframe.
    """
    df1 = gcy.DataFrame({
        'name': ["John", "Smith"],
        'age': [44, 55],
    })
    df2 = gcy.DataFrame({
        'age': [44, 66],
        'name': ["John", "Joseph"],
    })
    print("df1: \n", df1)
    print("df2: \n", df2)
    # The dataframes go in as ONE sequence, the same way dist_concat calls
    # gcy.concat. Passing them as two positional arguments would bind df2 to
    # concat's second parameter instead of treating it as input data.
    df3 = gcy.concat([df1, df2])
    # The label previously said "set difference", which is a different
    # operation from the one performed here.
    print("locally concatenated df: \n", df3)
Esempio n. 4
0
def local_union():
    """Print the set union of two small dataframes, without and with duplicates."""
    left = gcy.DataFrame({'name': ["John", "Smith"], 'age': [44, 55]})
    # Same columns as `left`, declared in the opposite order.
    right = gcy.DataFrame({'age': [44, 66], 'name': ["John", "Joseph"]})
    print("df1: \n", left)
    print("df2: \n", right)

    union = left.set_union(right)
    print("set union: \n", union)

    union_keeping_duplicates = left.set_union(right, keep_duplicates=True)
    print("set union with duplicates: \n", union_keeping_duplicates)
Esempio n. 5
0
def local_intersection():
    """Print the set intersection of two small dataframes.

    The intersection is computed twice: once with default arguments and once
    with ``subset=["age"]``.
    """
    left = gcy.DataFrame({'name': ["John", "Smith"], 'age': [44, 55]})
    # Same columns as `left`, declared in the opposite order.
    right = gcy.DataFrame({'age': [44, 66], 'name': ["John", "Joseph"]})
    print("df1: \n", left)
    print("df2: \n", right)

    common = left.set_intersect(right)
    print("set intersect: \n", common)

    common_on_age = left.set_intersect(right, subset=["age"])
    print("set intersect with subset: \n", common_on_age)
Esempio n. 6
0
def local_groupby():
    """Walk through local groupby usage on a small three-column dataframe.

    Groups by column 'a', prints several aggregations of the same groupby
    object, iterates over its groups and group indices, then repeats 'sum'
    and 'max' for a groupby on the two columns 'a' and 'b'.
    """
    frame = gcy.DataFrame({
        'a': [1, 1, 1, 2, 2],
        'b': [1, 1, 2, 2, 3],
        'c': [1, 2, 3, 4, 5],
    })
    print("df: \n", frame)

    by_a = frame.groupby("a")

    totals = by_a.sum()
    print("df grouped-by on column 'a', performed 'sum': \n", totals)

    maxima = by_a.max()
    print("performed 'max' on the same groupby object: \n", maxima)

    # Selecting a column on the groupby object restricts the aggregation to it.
    b_totals = by_a["b"].sum()
    print(
        "performed 'sum' on the same groupby object, aggregated on the column 'b' only: \n",
        b_totals)

    c_totals = by_a["c"].sum()
    print(
        "performed 'sum' on the same groupby object, aggregated on the column 'c' only: \n",
        c_totals)

    means = by_a.mean()
    print("performed 'mean' on the same groupby object: \n", means)

    print("iterate through the groups: ")
    for group_key, members in by_a:
        print("group: ", group_key)
        print(members)

    print("iterate through the group indices: ")
    for group_key in by_a.groups:
        print("group key: ", group_key)
        print(by_a.groups[group_key])

    by_a_and_b = frame.groupby(["a", "b"])
    pair_totals = by_a_and_b.sum()
    print("df grouped-by on columns 'a' and 'b', performed 'sum': \n",
          pair_totals)
    pair_maxima = by_a_and_b.max()
    print("performed 'max' on the same groupby object: \n", pair_maxima)
Esempio n. 7
0
def local_merge():
    """Left-merge two random integer dataframes on 'first' without a CylonEnv."""

    def _random_frame():
        # Two integer columns, five random values each.
        return gcy.DataFrame({
            'first': cp.random.randint(100, 110, 5),
            'second': cp.random.randint(100, 110, 5),
        })

    left = _random_frame()
    right = _random_frame()
    print("df1: \n", left)
    print("df2: \n", right)

    merge_options = dict(
        right=right,
        how="left",
        on="first",
        left_index=False,
        right_index=False,
    )
    merged = left.merge(**merge_options)
    print("locally merged df: \n", merged)
Esempio n. 8
0
def dist_concat():
    """Concatenate two random dataframes with join="inner" and a CylonEnv.

    Starts a distributed CylonEnv over MPI, builds two 5-row dataframes whose
    columns are declared in opposite order, prints them, concatenates them and
    prints the result. The environment is finalized before returning.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    first_then_second = gcy.DataFrame({
        'first': cp.random.randint(0, 10, 5),
        'second': cp.random.randint(100, 110, 5),
    })
    # Same two columns, declared in the opposite order.
    second_then_first = gcy.DataFrame({
        'second': cp.random.randint(100, 110, 5),
        'first': cp.random.randint(0, 10, 5),
    })
    print(first_then_second)
    print(second_then_first)

    frames = [first_then_second, second_then_first]
    combined = gcy.concat(frames, join="inner", env=env)
    print("distributed concated df:\n", combined)
    env.finalize()
Esempio n. 9
0
def dist_join():
    """Join two 10-row random dataframes with a distributed CylonEnv.

    The environment is created at the start and finalized before returning;
    both inputs and the joined dataframe are printed.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    def _random_frame():
        # Columns 'first' and 'second', ten cp.random.rand values each.
        return gcy.DataFrame({
            'first': cp.random.rand(10),
            'second': cp.random.rand(10),
        })

    left, right = _random_frame(), _random_frame()
    print(left)
    print(right)

    joined = left.join(other=right, env=env)
    print("distributed joined df:\n", joined)
    env.finalize()
Esempio n. 10
0
def local_diff():
    """Print set differences of two small dataframes in both directions.

    Each direction is computed twice: once with default arguments and once
    with ``subset=["name"]``.
    """
    left = gcy.DataFrame({'name': ["John", "Smith"], 'age': [44, 55]})
    # Same columns as `left`, declared in the opposite order.
    right = gcy.DataFrame({'age': [44, 66], 'name': ["John", "Joseph"]})
    print("df1: \n", left)
    print("df2: \n", right)

    left_minus_right = left.set_difference(right)
    print("df1 set difference df2: \n", left_minus_right)

    right_minus_left = right.set_difference(left)
    print("df2 set difference df1: \n", right_minus_left)

    left_minus_right_by_name = left.set_difference(right, subset=["name"])
    print("df1 set difference df2 on subset=['name']: \n",
          left_minus_right_by_name)

    right_minus_left_by_name = right.set_difference(left, subset=["name"])
    print("df2 set difference df1 on subset=['name']: \n",
          right_minus_left_by_name)
Esempio n. 11
0
def dist_groupby():
    """Walk through groupby usage with a distributed CylonEnv.

    Every rank builds a small three-column dataframe, then prints several
    aggregations for groupbys on column 'a', on columns 'a' and 'b', on the
    index level 'a', and on a two-column projection of the dataframe. The
    environment is finalized before returning.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    if env.rank == 0:
        df = gcy.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 3], 'c': [1, 3, 5]})
        print("df on rank 0: \n", df)
    elif env.rank == 1:
        df = gcy.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 4], 'c': [2, 4, 6]})
        print("df on rank 1: \n", df)
    else:
        # Without this branch `df` stays unbound on every rank above 1 and the
        # first groupby below raises UnboundLocalError when the example is
        # launched with more than two workers. Those ranks reuse the rank-1
        # data so that every worker has a dataframe to group.
        df = gcy.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 4], 'c': [2, 4, 6]})
        print("df on rank", env.rank, ": \n", df)

    gby = df.groupby("a", env=env)
    print("df grouped-by on column 'a', performed 'sum': \n", gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())
    print(
        "performed 'sum' on the same groupby object, aggregated on the column 'b' only: \n",
        gby["b"].sum())
    print("performed 'mean' on the same groupby object: \n", gby.mean())
    print("sizes of each group: \n", gby.size())

    gby = df.groupby(["a", "b"], env=env)
    print("df grouped-by on columns a and b, performed 'sum': \n", gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())

    # groupby on index column with "level" parameter
    df1 = df.set_index("a")
    gby = df1.groupby(level="a", env=env)
    print("df grouped-by on index 'a', performed 'sum': \n", gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())

    # if the original dataframe has many columns and
    # we only want to perform the groupby on some columns only,
    # the best way is to create a new dataframe with a subset of columns and
    # perform the groupby on this new dataframe
    df2 = df[["a", "b"]]
    print("two columns projected dataframe:\n", df2)
    gby = df2.groupby("a", env=env)
    print("grouped-by on column 'a' of projected df, performed 'sum': \n",
          gby.sum())
    print("performed 'max' on the same groupby object: \n", gby.max())

    env.finalize()
Esempio n. 12
0
def dist_intersection():
    """Print set intersections of two rank-dependent dataframes with a CylonEnv.

    The 'weight' column of both dataframes is offset by the worker's rank.
    The intersection is computed once with default arguments and once with
    ``subset=["age"]``; the environment is finalized before returning.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    def _weights():
        # Rank-dependent values shared by both dataframes.
        return [60 + env.rank, 80 + env.rank]

    left = gcy.DataFrame({'weight': _weights(), 'age': [44, 55]})
    right = gcy.DataFrame({'weight': _weights(), 'age': [44, 66]})
    print(left)
    print(right)

    common = left.set_intersect(other=right, env=env)
    print("distributed set intersection:\n", common)

    common_on_age = left.set_intersect(other=right, subset=["age"], env=env)
    print("distributed set intersection with a subset of columns:\n",
          common_on_age)
    env.finalize()
Esempio n. 13
0
def set_index():
    """Set column 'first' as the index of a random dataframe, then reset it.

    The 20-row input, the re-indexed dataframe and the dataframe returned by
    ``reset_index`` are printed in turn.
    """
    columns = {
        'first': cp.random.randint(100, 110, 20),
        'second': cp.random.randint(100, 110, 20),
    }
    original = gcy.DataFrame(columns)
    print("df1: \n", original)

    indexed = original.set_index("first")
    # The label and the dataframe are printed by two separate calls here.
    print("set index to first: \n")
    print(indexed)

    restored = indexed.reset_index()
    print("index reset: \n", restored)
Esempio n. 14
0
def dist_union():
    """Print set unions of two rank-dependent dataframes with a CylonEnv.

    The 'weight' column of both dataframes is offset by the worker's rank and
    the two dataframes declare their columns in opposite order. The union is
    computed once with default arguments and once with ``keep_duplicates=True``
    and ``ignore_index=True``; the environment is finalized before returning.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    def _weights():
        # Rank-dependent values shared by both dataframes.
        return [60 + env.rank, 80 + env.rank]

    left = gcy.DataFrame({'weight': _weights(), 'age': [44, 55]})
    right = gcy.DataFrame({'age': [44, 66], 'weight': _weights()})
    print(left)
    print(right)

    union = left.set_union(other=right, env=env)
    print("distributed set union:\n", union)

    union_keeping_duplicates = left.set_union(
        other=right,
        keep_duplicates=True,
        ignore_index=True,
        env=env,
    )
    print("distributed set union with duplicates:\n", union_keeping_duplicates)
    env.finalize()
Esempio n. 15
0
def drop_cuplicates():
    """Drop duplicate rows from a random dataframe with a distributed CylonEnv.

    Builds a 20-row dataframe of random integers, calls ``drop_duplicates``
    with ``ignore_index=True`` and prints the outcome. The environment is
    finalized before returning.

    The function name keeps its original (misspelled) form so that existing
    callers continue to work.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)
    df1 = gcy.DataFrame({
        'first': cp.random.randint(100, 110, 20),
        'second': cp.random.randint(100, 110, 20)
    })
    print("df1: \n", df1)
    df2 = df1.drop_duplicates(ignore_index=True, env=env)
    # Test for "no dataframe returned" explicitly instead of relying on the
    # truthiness of a dataframe. Pandas-style dataframes may raise on bool()
    # or evaluate an empty (but valid) result as falsy.
    # NOTE(review): presumably drop_duplicates returns None only when it
    # works in place - confirm against the pygcylon API.
    if df2 is not None:
        print("duplicates dropped: \n", df2)
    else:
        print("duplicates dropped: \n", df1)
    env.finalize()
Esempio n. 16
0
def dist_diff():
    """Print set differences of two rank-dependent dataframes with a CylonEnv.

    The 'weight' column of both dataframes is offset by the worker's rank.
    The difference is computed in both directions with default arguments and
    once more (df2 minus df1) with ``subset=["age"]``; the environment is
    finalized before returning.
    """
    env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
    print("CylonEnv Initialized: My rank: ", env.rank)

    def _weights():
        # Rank-dependent values shared by both dataframes.
        return [60 + env.rank, 80 + env.rank]

    left = gcy.DataFrame({'weight': _weights(), 'age': [44, 55]})
    right = gcy.DataFrame({'weight': _weights(), 'age': [44, 66]})
    print("df1: \n", left)
    print("df2: \n", right)

    left_minus_right = left.set_difference(other=right, env=env)
    print("df1 distributed set difference df2:\n", left_minus_right)

    right_minus_left = right.set_difference(other=left, env=env)
    print("df2 distributed set difference df1:\n", right_minus_left)

    # NOTE(review): the df1-minus-df2 variant with subset=["age"] was disabled
    # in the original example; the reason is not recorded - confirm before
    # re-enabling it.
    right_minus_left_by_age = right.set_difference(left, subset=["age"], env=env)
    print("df2 distributed set difference df1 on subset=['age']: \n",
          right_minus_left_by_age)
    env.finalize()
Esempio n. 17
0
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##

import cupy as cp
import pycylon as cy
import pygcylon as gcy

# Start a distributed Cylon environment configured for MPI.
env: cy.CylonEnv = cy.CylonEnv(config=cy.MPIConfig(), distributed=True)
print("CylonContext Initialized: My rank: ", env.rank)

# Each rank draws its random values from a range that starts at 100 * rank.
start = 100 * env.rank
df = gcy.DataFrame({
    column: cp.random.randint(start, start + 10, 10)
    for column in ('first', 'second')
})
print("initial df from rank: ", env.rank, "\n", df)

# Shuffle the dataframe on column 'first', handing the environment to shuffle.
shuffledDF = df.shuffle(on="first", ignore_index=True, env=env)
print("shuffled df from rank: ", env.rank, "\n", shuffledDF)

env.finalize()
# env.rank is read once more after finalize(), as in the original script.
print("after finalize from the rank:", env.rank)