Beispiel #1
0
    def test_ViewJoin(self):
        df1 = grizzly.read_table("t1")
        df2 = grizzly.read_table("t2")

        j = df1.join(df2,
                     on=(df1.actor1name == df2.actor2name) |
                     (df1["actor1countrycode"] <= df2["actor2countrycode"]),
                     how="left outer")
        cnt = j.count()
        self.assertEqual(cnt, 9899259)
Beispiel #2
0
    def test_joinTest(self):
        df = grizzly.read_table("events")
        df = df[df['globaleventid'] == 470259271]

        df2 = grizzly.read_table("events")

        joined = df.join(other=df2,
                         on=["globaleventid", "globaleventid"],
                         how="inner")

        actual = joined.generate()
        expected = "SELECT * FROM events $t1 inner join events $t2 ON $t1.globaleventid = $t2.globaleventid where $t1.globaleventid = 470259271"

        self.matchSnipped(actual, expected)
Beispiel #3
0
    def test_complexJoin(self):
        df1 = grizzly.read_table("t1")
        df2 = grizzly.read_table("t2")

        j = df1.join(df2,
                     on=(df1['a'] == df2['b']) & (df1['c'] <= df2['d']),
                     how="left outer")

        expected = "SELECT * FROM t1 $t0 LEFT OUTER JOIN t2 $t2 ON $t0.a = $t2.b AND $t0.c <= $t2.d".lower(
        )

        actual = j.generate().lower()

        self.matchSnipped(actual, expected)
Beispiel #4
0
    def test_New(self):
        df = grizzly.read_table("events")
        df = df["a"]
        df = df[df["a"] == 2]

        df2 = grizzly.read_table("events")
        df3 = df.join(df2, on=["a", "a"])
        actualDF = df.generate()
        expectedDF = "select $t0.a from events $t0 where $t0.a = 2"

        self.matchSnipped(actualDF, expectedDF)

        actualDF3 = df3.generate()
        expectedDF3 = "select $t0.a from events $t0 inner join (select * from events $t1) $t2 on $t0.a = $t2.a where $t0.a = 2"
Beispiel #5
0
    def test_groupByTableAggStar(self):
        df = grizzly.read_table("events")
        g = df.groupby("year")

        a = g.count()
        # print(f"cnt: {a}")
        self.assertEqual(a, 3)
Beispiel #6
0
    def test_selectStarFilterString(self):
        df = grizzly.read_table("events")
        df = df[df['globaleventid'] == 'abc']
        actual = df.generate()
        expected = "select * from events $t0  where $t0.globaleventid = 'abc'"

        self.matchSnipped(actual, expected)
Beispiel #7
0
    def test_Gt(self):
        df = grizzly.read_table("events")
        df = df[df['globaleventid'] > 468189636]
        actual = df.generate()
        expected = "select * from events $t0  where $t0.globaleventid > 468189636"

        self.matchSnipped(actual, expected)
Beispiel #8
0
    def test_groupByTableAgg(self):
        df = grizzly.read_table("events")
        df = df[df['globaleventid'] == 476829606]
        g = df.groupby(["year", "monthyear"])

        a = g.count("monthyear")
        # print(f"cnt: {a}")
        self.assertEqual(a, 1)
Beispiel #9
0
    def test_DistinctOneCol(self):
        df = grizzly.read_table("events")
        df = df['isrootevent'].distinct()
        actual = df.generate()
        # print(actual)
        expected = "select distinct $t1.isrootevent from events $t1"

        self.matchSnipped(actual, expected)
Beispiel #10
0
    def test_groupby(self):
        df = grizzly.read_table("events")
        g = df.groupby(["year", "actor1name"])
        a = g.agg(col="actor2name", aggType=AggregateType.MEAN)

        expected = "select $t0.year, $t0.actor1name, avg($t0.actor2name) from events $t0 group by $t0.year, $t0.actor1name"
        actual = a.generate()

        self.matchSnipped(actual, expected)
Beispiel #11
0
    def test_triJoin(self):
        df1 = grizzly.read_table("t1")
        df2 = grizzly.read_table("t2")
        df3 = grizzly.read_table("t3")
        df3 = df3[["b", "d"]]
        j = df1.join(df2,
                     on=(df1['a'] == df2['b']) & (df1['c'] <= df2['d']),
                     how="left outer")

        j = j[[df1.m, df2.x]]

        j2 = j.join(df3,
                    on=(j['m'] == df3['b']) & (j['x'] <= df3['d']),
                    how="inner")

        actual = j2.generate()
        expected = "select $t1.m, $t2.x, $t4.b, $t4.d from t1 $t1 left outer join t2 $t2 on $t1.a = $t2.b and $t1.c <= $t2.d inner join (select $t3.b, $t3.d from t3 $t3) $t4 on $t1.m = $t4.b and $t1.x <= $t4.d"
        self.matchSnipped(actual, expected)
Beispiel #12
0
    def test_selectStarGroupBy(self):
        df = grizzly.read_table("events")
        df = df[df['globaleventid'] == '468189636']
        g = df.groupby(["year", "monthyear"])

        actual = g.generate()
        expected = "select $t0.year, $t0.monthyear from events $t0 where $t0.globaleventid = '468189636' group by $t0.year, $t0.monthyear"

        self.matchSnipped(actual, expected)
Beispiel #13
0
    def test_selectColumnWithFilter(self):
        df = grizzly.read_table("events")
        df = df[df['globaleventid'] == 468189636]
        df = df['goldsteinscale']

        actual = df.generate()
        expected = "select $t0.goldsteinscale from events $t0 where $t0.globaleventid = 468189636"

        self.matchSnipped(actual, expected)
Beispiel #14
0
    def test_groupByWithAggTwice(self):
        df = grizzly.read_table("events")
        df = df[df['globaleventid'] == 476829606]
        g = df.groupby(["year", "monthyear"])

        agged = g.agg(col="actor2geo_type", aggType=AggregateType.COUNT)

        aggActual = agged.generate()
        aggExpected = "select $t0.year, $t0.monthyear, count($t0.actor2geo_type) from events $t0 where $t0.globaleventid = 476829606 group by $t0.year, $t0.monthyear"

        self.matchSnipped(aggActual, aggExpected)
Beispiel #15
0
    def test_toString(self):
        df = grizzly.read_table("events")

        df = df[df['globaleventid'] == 467268277]
        df = df[["actor1name", "actor2name", "globaleventid", "sourceurl"]]

        strDF = str(df)
        splt = strDF.split("\n")

        rows = df.count()
        # print(rows)
        dfLen = len(splt)
        rowsLen = rows + 3

        self.assertEqual(dfLen,
                         rowsLen)  # column names + top rule + bottom rule
Beispiel #16
0
    def test_parantheses(self):
        df = grizzly.read_table("t1")
        expr = (df['a'] == df['b']) & ((df['c'] <= df['d']) | ((df.f > 3) &
                                                               (df.e != None)))

        self.assertIsInstance(expr, LogicExpr, "top expression should be AND")
        self.assertIsInstance(expr.left, BoolExpr, "first left should be EQ")
        self.assertIsInstance(expr.right, LogicExpr,
                              "first right should be OR")

        andL = expr.left
        self.assertIsInstance(andL.left, ColRef,
                              "left of EQ should be a ColRef")
        self.assertEqual(andL.left.column, "a",
                         "left of EQ colref should be column a")

        self.assertIsInstance(andL.right, ColRef,
                              "right of EQ should be a ColRef")
        self.assertEqual(andL.right.column, "b",
                         "right of EQ colref should be column b")

        andR = expr.right
        self.assertIsInstance(andR.left, BoolExpr, "left of OR should be a LE")
        self.assertIsInstance(andR.right, LogicExpr,
                              "right of OR should be another AND")

        self.assertIsInstance(andR.left.left, ColRef,
                              "left of LE should be a ColRef")
        self.assertIsInstance(andR.left.right, ColRef,
                              "right of LE should be a ColRef")
        self.assertEqual(andR.left.left.column, "c")
        self.assertEqual(andR.left.right.column, "d")

        innerAnd = expr.right.right
        self.assertIsInstance(innerAnd.left, BoolExpr)
        self.assertIsInstance(innerAnd.left.left, ColRef)
        self.assertEqual(innerAnd.left.left.column, "f")
        self.assertIsInstance(innerAnd.left.right, Constant)
        self.assertEqual(innerAnd.left.right.value, 3)

        self.assertIsInstance(innerAnd.right, BoolExpr)
        self.assertIsInstance(innerAnd.right.left, ColRef)
        self.assertEqual(innerAnd.right.left.column, "e")
        self.assertIsNone(innerAnd.right.right)
Beispiel #17
0
    def test_show(self):
        df = grizzly.read_table("events")

        df = df[df['globaleventid'] <= 468189636]  #== 467268277
        df = df[["actor1name", "actor2name", "globaleventid", "sourceurl"]]

        from io import StringIO
        import sys
        try:
            bkp = sys.stdout
            sys.stdout = mystdout = StringIO()

            df.show(limit=None)

            output = mystdout.getvalue().splitlines()

            self.assertEqual(len(output), 2842 + 1)  #+1 for column names

        finally:
            sys.stdout = bkp
Beispiel #18
0
    def test_showPretty(self):
        df = grizzly.read_table("events")

        df = df[df['globaleventid'] <= 468189636]  #== 467268277
        df = df[["actor1name", "actor2name", "globaleventid", "sourceurl"]]

        from io import StringIO
        import sys
        try:
            bkp = sys.stdout
            sys.stdout = mystdout = StringIO()

            maxColWidth = 40

            df.show(pretty=True, maxColWidth=maxColWidth)

            output = mystdout.getvalue().splitlines()

            for row in output:
                for col in row:
                    self.assertLessEqual(len(col), maxColWidth)

        finally:
            sys.stdout = bkp
Beispiel #19
0
import grizzly
import sqlite3
from grizzly.relationaldbexecutor import RelationalExecutor
from grizzly.wrapper import wrapper
from grizzly.aggregates import AggregateType


con=wrapper.wrap(["./sample.xlsx","./sample.csv","./grizzly.db"])
grizzly.use(RelationalExecutor(con))

print("----------------show Excel Data sheet1------------------------")
dfexcel = grizzly.read_table("sheet1")
dfexcel.show(pretty=True)

print("----------------Aggregate function on excel------------------------")
g= dfexcel.groupby(["Country","Product"])
a = g.agg(col="Product",aggType=AggregateType.COUNT)

print(a.generate())
a.show(pretty=True)

print("---------------show Excel Data sheet2-------------------------")
dfexcel2 = grizzly.read_table("sheet2")
dfexcel2.show(pretty=True)

print("---------------join between two excel sheet-------------------------")
j= dfexcel.join(dfexcel2, on=(dfexcel.Segment == dfexcel2.Segment), how='inner')
print(j.generate())
j.show(pretty=True)

print("---------------Show csv data-------------------------")
Beispiel #20
0
 def test_selectCountStar(self):
     df = grizzly.read_table("events")
     self.assertEqual(df.count(), 30354)
Beispiel #21
0
 def test_selectStar(self):
     df = grizzly.read_table("events")
     actual = df.generate()
     expected = "select * from events $t0"
     self.matchSnipped(actual, expected)
Beispiel #22
0
 def test_selectCountCol(self):
     df = grizzly.read_table("events")
     cnt = df.count('actor2name')
     self.assertGreater(cnt, 0)
Beispiel #23
0
 def test_DistinctAll(self):
     df = grizzly.read_table("events")
     df = df.distinct()
     actual = df.generate()
     expected = "SELECT distinct * FROM events $t1".lower()
     self.matchSnipped(actual, expected)
Beispiel #24
0
import grizzly
import sqlite3
from grizzly.relationaldbexecutor import RelationalExecutor

con = sqlite3.connect("grizzly.db")

grizzly.use(RelationalExecutor(con))

df = grizzly.read_table("events")

df[df["globaleventid"] == 470747760]  # filter
df = df[["actor1name", "actor2name"]]

df.show(pretty=True)

print("----------------------------------------")

df1 = grizzly.read_table("t1")
df2 = grizzly.read_table("t2")

j = df1.join(df2,
             on=(df1.actor1name == df2.actor2name) |
             (df1["actor1countrycode"] <= df2["actor2countrycode"]),
             how="left outer")
print(j.generate())
cnt = j.count()
print(f"join result contais {cnt} elments")

print("----------------------------------------")

df = grizzly.read_table("events")
Beispiel #25
0
 def test_DistinctTwoCols(self):
     df = grizzly.read_table("events")
     df = df[['y', "x"]].distinct()
     actual = df.generate()
     expected = "select distinct $t0.y, $t0.x from events $t0"
     self.matchSnipped(actual, expected)
Beispiel #26
0
    def test_collectWithHeader(self):
        df = grizzly.read_table("events")
        arr = df.collect(includeHeader=True)

        self.assertEqual(len(arr), 30354 + 1)