def test_ViewJoin(self): df1 = grizzly.read_table("t1") df2 = grizzly.read_table("t2") j = df1.join(df2, on=(df1.actor1name == df2.actor2name) | (df1["actor1countrycode"] <= df2["actor2countrycode"]), how="left outer") cnt = j.count() self.assertEqual(cnt, 9899259)
def test_joinTest(self): df = grizzly.read_table("events") df = df[df['globaleventid'] == 470259271] df2 = grizzly.read_table("events") joined = df.join(other=df2, on=["globaleventid", "globaleventid"], how="inner") actual = joined.generate() expected = "SELECT * FROM events $t1 inner join events $t2 ON $t1.globaleventid = $t2.globaleventid where $t1.globaleventid = 470259271" self.matchSnipped(actual, expected)
def test_complexJoin(self): df1 = grizzly.read_table("t1") df2 = grizzly.read_table("t2") j = df1.join(df2, on=(df1['a'] == df2['b']) & (df1['c'] <= df2['d']), how="left outer") expected = "SELECT * FROM t1 $t0 LEFT OUTER JOIN t2 $t2 ON $t0.a = $t2.b AND $t0.c <= $t2.d".lower( ) actual = j.generate().lower() self.matchSnipped(actual, expected)
def test_New(self): df = grizzly.read_table("events") df = df["a"] df = df[df["a"] == 2] df2 = grizzly.read_table("events") df3 = df.join(df2, on=["a", "a"]) actualDF = df.generate() expectedDF = "select $t0.a from events $t0 where $t0.a = 2" self.matchSnipped(actualDF, expectedDF) actualDF3 = df3.generate() expectedDF3 = "select $t0.a from events $t0 inner join (select * from events $t1) $t2 on $t0.a = $t2.a where $t0.a = 2"
def test_groupByTableAggStar(self): df = grizzly.read_table("events") g = df.groupby("year") a = g.count() # print(f"cnt: {a}") self.assertEqual(a, 3)
def test_selectStarFilterString(self): df = grizzly.read_table("events") df = df[df['globaleventid'] == 'abc'] actual = df.generate() expected = "select * from events $t0 where $t0.globaleventid = 'abc'" self.matchSnipped(actual, expected)
def test_Gt(self): df = grizzly.read_table("events") df = df[df['globaleventid'] > 468189636] actual = df.generate() expected = "select * from events $t0 where $t0.globaleventid > 468189636" self.matchSnipped(actual, expected)
def test_groupByTableAgg(self): df = grizzly.read_table("events") df = df[df['globaleventid'] == 476829606] g = df.groupby(["year", "monthyear"]) a = g.count("monthyear") # print(f"cnt: {a}") self.assertEqual(a, 1)
def test_DistinctOneCol(self): df = grizzly.read_table("events") df = df['isrootevent'].distinct() actual = df.generate() # print(actual) expected = "select distinct $t1.isrootevent from events $t1" self.matchSnipped(actual, expected)
def test_groupby(self): df = grizzly.read_table("events") g = df.groupby(["year", "actor1name"]) a = g.agg(col="actor2name", aggType=AggregateType.MEAN) expected = "select $t0.year, $t0.actor1name, avg($t0.actor2name) from events $t0 group by $t0.year, $t0.actor1name" actual = a.generate() self.matchSnipped(actual, expected)
def test_triJoin(self): df1 = grizzly.read_table("t1") df2 = grizzly.read_table("t2") df3 = grizzly.read_table("t3") df3 = df3[["b", "d"]] j = df1.join(df2, on=(df1['a'] == df2['b']) & (df1['c'] <= df2['d']), how="left outer") j = j[[df1.m, df2.x]] j2 = j.join(df3, on=(j['m'] == df3['b']) & (j['x'] <= df3['d']), how="inner") actual = j2.generate() expected = "select $t1.m, $t2.x, $t4.b, $t4.d from t1 $t1 left outer join t2 $t2 on $t1.a = $t2.b and $t1.c <= $t2.d inner join (select $t3.b, $t3.d from t3 $t3) $t4 on $t1.m = $t4.b and $t1.x <= $t4.d" self.matchSnipped(actual, expected)
def test_selectStarGroupBy(self): df = grizzly.read_table("events") df = df[df['globaleventid'] == '468189636'] g = df.groupby(["year", "monthyear"]) actual = g.generate() expected = "select $t0.year, $t0.monthyear from events $t0 where $t0.globaleventid = '468189636' group by $t0.year, $t0.monthyear" self.matchSnipped(actual, expected)
def test_selectColumnWithFilter(self): df = grizzly.read_table("events") df = df[df['globaleventid'] == 468189636] df = df['goldsteinscale'] actual = df.generate() expected = "select $t0.goldsteinscale from events $t0 where $t0.globaleventid = 468189636" self.matchSnipped(actual, expected)
def test_groupByWithAggTwice(self): df = grizzly.read_table("events") df = df[df['globaleventid'] == 476829606] g = df.groupby(["year", "monthyear"]) agged = g.agg(col="actor2geo_type", aggType=AggregateType.COUNT) aggActual = agged.generate() aggExpected = "select $t0.year, $t0.monthyear, count($t0.actor2geo_type) from events $t0 where $t0.globaleventid = 476829606 group by $t0.year, $t0.monthyear" self.matchSnipped(aggActual, aggExpected)
def test_toString(self): df = grizzly.read_table("events") df = df[df['globaleventid'] == 467268277] df = df[["actor1name", "actor2name", "globaleventid", "sourceurl"]] strDF = str(df) splt = strDF.split("\n") rows = df.count() # print(rows) dfLen = len(splt) rowsLen = rows + 3 self.assertEqual(dfLen, rowsLen) # column names + top rule + bottom rule
def test_parantheses(self): df = grizzly.read_table("t1") expr = (df['a'] == df['b']) & ((df['c'] <= df['d']) | ((df.f > 3) & (df.e != None))) self.assertIsInstance(expr, LogicExpr, "top expression should be AND") self.assertIsInstance(expr.left, BoolExpr, "first left should be EQ") self.assertIsInstance(expr.right, LogicExpr, "first right should be OR") andL = expr.left self.assertIsInstance(andL.left, ColRef, "left of EQ should be a ColRef") self.assertEqual(andL.left.column, "a", "left of EQ colref should be column a") self.assertIsInstance(andL.right, ColRef, "right of EQ should be a ColRef") self.assertEqual(andL.right.column, "b", "right of EQ colref should be column b") andR = expr.right self.assertIsInstance(andR.left, BoolExpr, "left of OR should be a LE") self.assertIsInstance(andR.right, LogicExpr, "right of OR should be another AND") self.assertIsInstance(andR.left.left, ColRef, "left of LE should be a ColRef") self.assertIsInstance(andR.left.right, ColRef, "right of LE should be a ColRef") self.assertEqual(andR.left.left.column, "c") self.assertEqual(andR.left.right.column, "d") innerAnd = expr.right.right self.assertIsInstance(innerAnd.left, BoolExpr) self.assertIsInstance(innerAnd.left.left, ColRef) self.assertEqual(innerAnd.left.left.column, "f") self.assertIsInstance(innerAnd.left.right, Constant) self.assertEqual(innerAnd.left.right.value, 3) self.assertIsInstance(innerAnd.right, BoolExpr) self.assertIsInstance(innerAnd.right.left, ColRef) self.assertEqual(innerAnd.right.left.column, "e") self.assertIsNone(innerAnd.right.right)
def test_show(self): df = grizzly.read_table("events") df = df[df['globaleventid'] <= 468189636] #== 467268277 df = df[["actor1name", "actor2name", "globaleventid", "sourceurl"]] from io import StringIO import sys try: bkp = sys.stdout sys.stdout = mystdout = StringIO() df.show(limit=None) output = mystdout.getvalue().splitlines() self.assertEqual(len(output), 2842 + 1) #+1 for column names finally: sys.stdout = bkp
def test_showPretty(self): df = grizzly.read_table("events") df = df[df['globaleventid'] <= 468189636] #== 467268277 df = df[["actor1name", "actor2name", "globaleventid", "sourceurl"]] from io import StringIO import sys try: bkp = sys.stdout sys.stdout = mystdout = StringIO() maxColWidth = 40 df.show(pretty=True, maxColWidth=maxColWidth) output = mystdout.getvalue().splitlines() for row in output: for col in row: self.assertLessEqual(len(col), maxColWidth) finally: sys.stdout = bkp
import grizzly import sqlite3 from grizzly.relationaldbexecutor import RelationalExecutor from grizzly.wrapper import wrapper from grizzly.aggregates import AggregateType con=wrapper.wrap(["./sample.xlsx","./sample.csv","./grizzly.db"]) grizzly.use(RelationalExecutor(con)) print("----------------show Excel Data sheet1------------------------") dfexcel = grizzly.read_table("sheet1") dfexcel.show(pretty=True) print("----------------Aggregate function on excel------------------------") g= dfexcel.groupby(["Country","Product"]) a = g.agg(col="Product",aggType=AggregateType.COUNT) print(a.generate()) a.show(pretty=True) print("---------------show Excel Data sheet2-------------------------") dfexcel2 = grizzly.read_table("sheet2") dfexcel2.show(pretty=True) print("---------------join between two excel sheet-------------------------") j= dfexcel.join(dfexcel2, on=(dfexcel.Segment == dfexcel2.Segment), how='inner') print(j.generate()) j.show(pretty=True) print("---------------Show csv data-------------------------")
def test_selectCountStar(self): df = grizzly.read_table("events") self.assertEqual(df.count(), 30354)
def test_selectStar(self): df = grizzly.read_table("events") actual = df.generate() expected = "select * from events $t0" self.matchSnipped(actual, expected)
def test_selectCountCol(self): df = grizzly.read_table("events") cnt = df.count('actor2name') self.assertGreater(cnt, 0)
def test_DistinctAll(self): df = grizzly.read_table("events") df = df.distinct() actual = df.generate() expected = "SELECT distinct * FROM events $t1".lower() self.matchSnipped(actual, expected)
import grizzly import sqlite3 from grizzly.relationaldbexecutor import RelationalExecutor con = sqlite3.connect("grizzly.db") grizzly.use(RelationalExecutor(con)) df = grizzly.read_table("events") df[df["globaleventid"] == 470747760] # filter df = df[["actor1name", "actor2name"]] df.show(pretty=True) print("----------------------------------------") df1 = grizzly.read_table("t1") df2 = grizzly.read_table("t2") j = df1.join(df2, on=(df1.actor1name == df2.actor2name) | (df1["actor1countrycode"] <= df2["actor2countrycode"]), how="left outer") print(j.generate()) cnt = j.count() print(f"join result contais {cnt} elments") print("----------------------------------------") df = grizzly.read_table("events")
def test_DistinctTwoCols(self): df = grizzly.read_table("events") df = df[['y', "x"]].distinct() actual = df.generate() expected = "select distinct $t0.y, $t0.x from events $t0" self.matchSnipped(actual, expected)
def test_collectWithHeader(self): df = grizzly.read_table("events") arr = df.collect(includeHeader=True) self.assertEqual(len(arr), 30354 + 1)