def testBatch(self):
    """Compile two aggregations over one cached source into a single DAG,
    execute them in parallel, then repeat asynchronously with a timeout.
    """
    if self.sql_engine.name == 'mysql':
        # TODO: mysqldb is not thread-safe, skip first
        return

    data = self._gen_data(10, value_range=(-1000, 1000))

    expr = self.expr[self.expr.id < 10].cache()
    expr1 = expr.id.sum()
    expr2 = expr.id.mean()

    dag = self.engine.compile([expr1, expr2])
    # expect 3 nodes with 2 edges in the compiled DAG
    self.assertEqual(len(dag.nodes()), 3)
    self.assertEqual(sum(len(v) for v in dag._graph.values()), 2)

    # compute the expected sum/mean directly from the generated rows
    expect1 = sum(d[1] for d in data if d[1] < 10)
    length = len([d[1] for d in data if d[1] < 10])
    expect2 = (expect1 / float(length)) if length > 0 else 0.0

    res = self.engine.execute([expr1, expr2], n_parallel=2)
    self.assertEqual(res[0], expect1)
    self.assertAlmostEqual(res[1], expect2, delta=0.001)
    self.assertTrue(context.is_cached(expr))

    # test async and timeout
    expr = self.expr[self.expr.id < 10]
    expr1 = expr.id.sum()
    expr2 = expr.id.mean()

    # 'async' became a reserved keyword in Python 3.7, so `async=True` is a
    # SyntaxError there; pass the same keyword via dict unpacking to keep the
    # argument name identical while staying syntactically valid everywhere.
    fs = self.engine.execute([expr, expr1, expr2], n_parallel=2,
                             timeout=1, **{'async': True})
    self.assertEqual(len(fs), 3)
    self.assertEqual(fs[1].result(), expect1)
    self.assertAlmostEqual(fs[2].result(), expect2, delta=0.001)
    self.assertTrue(context.is_cached(expr))
def testHeadAndTail(self):
    """head()/tail() behavior on ODPS- and pandas-backed frames and columns,
    including a tunnel-only engine and an empty freshly-created table.
    """
    result = self.odps_df.head(2)
    self.assertEqual(len(result), 2)

    filtered = self.odps_df[self.odps_df['name'] == 'name1']
    result = filtered.head(1)
    self.assertEqual(len(result), 1)
    # head() on a filtered frame is expected to leave it cached
    self.assertTrue(context.is_cached(filtered))

    result = self.odps_df.tail(2)
    self.assertEqual(len(result), 2)
    self.assertTrue(all(it > 1 for it in result.values['id']))

    # single columns support head/tail as well
    self.assertEqual(len(self.odps_df.name.head(2)), 2)
    self.assertEqual(len(self.odps_df.name.tail(2)), 2)

    # same checks against the pandas-backed frame
    result = self.pd_df.head(1)
    self.assertEqual(len(result), 1)

    filtered = self.pd_df[self.pd_df['name'] == 'name1']
    result = filtered.head(1)
    self.assertEqual(len(result), 1)
    self.assertTrue(context.is_cached(filtered))

    result = self.pd_df.tail(1)
    self.assertEqual(len(result), 1)
    self.assertEqual(result.values['id'][0], 6)

    self.assertEqual(len(self.pd_df.name.head(1)), 1)
    self.assertEqual(len(self.pd_df.name.tail(1)), 1)

    class TunnelOnlyODPSEngine(ODPSSQLEngine):
        # force every execution through the tunnel code path
        def _do_execute(self, *args, **kwargs):
            kwargs['_force_tunnel'] = True
            return super(TunnelOnlyODPSEngine, self)._do_execute(*args, **kwargs)

    engine = MixedEngine(self.odps)
    engine._odpssql_engine = TunnelOnlyODPSEngine(self.odps)

    result = engine.execute(self.odps_df['id'], head=3)
    self.assertIsNotNone(result)
    self.assertEqual(sum(result.values['id']), 6)

    table_name = tn('pyodps_df_mixed2')
    self.odps.delete_table(table_name, if_exists=True)
    source_table = next(self.odps_df.data_source())
    new_table = self.odps.create_table(table_name, source_table.schema)
    try:
        # head() on an empty table yields an empty result rather than an error
        self.assertEqual(len(DataFrame(new_table).head(10)), 0)
    finally:
        new_table.drop()
def testPandasGroupbyFilter(self):
    """Filtering a groupby aggregation on a pandas-backed frame works both
    before and after the aggregation result has been executed (cached).
    """
    import pandas as pd

    raw = [
        [2001, 1],
        [2002, 2],
        [2003, 3],
    ]
    df = DataFrame(pd.DataFrame(raw, columns=['id', 'fid']))
    expected = [[2003, 3]]

    # filter applied to a freshly built groupby result
    grouped = df.groupby('id').agg(df.fid.sum())
    filtered = grouped[grouped.id == 2003]
    self.assertEqual(filtered.execute().values.values.tolist(), expected)

    # filter applied after the groupby result was executed and cached
    grouped = df.groupby('id').agg(df.fid.sum())
    grouped.execute()
    self.assertTrue(context.is_cached(grouped))
    filtered = grouped[grouped.id == 2003]
    self.assertEqual(filtered.execute().values.values.tolist(), expected)
    # executing again must give the same answer
    self.assertEqual(filtered.execute().values.values.tolist(), expected)

    # scalar aggregation is likewise stable across repeated executions
    total = df.fid.sum()
    self.assertEqual(total.execute(), 6)
    self.assertEqual(total.execute(), 6)
def testHandleCache(self):
    """Previously executed (cached) expressions should be reused when
    compiling expressions that build upon them.
    """
    collection = self.pd_df['name', self.pd_df.id + 1]
    collection.execute()
    self.assertTrue(context.is_cached(collection))

    filtered = collection[collection.id < 10]
    dag = self.engine.compile(filtered)
    # the cached parent collapses into a single source-collection node
    self.assertEqual(len(dag.nodes()), 1)
    self.assertTrue(is_source_collection(dag.nodes()[0].expr.input))

    count_expr = self.pd_df[self.pd_df.id < 10].count()
    count_val = count_expr.execute()
    self.assertTrue(context.is_cached(count_expr))

    incremented = count_expr + 1
    dag = self.engine.compile(incremented)
    self.assertEqual(len(dag.nodes()), 1)
    # the cached scalar should be substituted in as a concrete value
    self.assertIsNotNone(dag.nodes()[0].expr._fields[0].lhs.value)
    self.assertEqual(incremented.execute(), count_val + 1)
def testUseCache(self):
    """When a cache's backing table is dropped, executing a descendant
    expression should transparently rebuild the cache.
    """
    self.engine._selecter.force_odps = True

    df_cache = self.odps_df[self.odps_df['name'] == 'name1'].cache()
    derived = df_cache[df_cache.id * 2, df_cache.exclude('id')]
    self.assertEqual(len(self.engine.execute(derived, head=10)), 2)

    # drop the backing table so the cached data disappears
    context.get_cached(df_cache).drop()

    # re-execution should regenerate the cache table
    projected = df_cache['name', df_cache.id * 2]
    self.assertEqual(len(self.engine.execute(projected, head=10)), 2)
    self.assertTrue(context.is_cached(df_cache))
    self.assertTrue(self.odps.exist_table(context.get_cached(df_cache).name))
def testExecuteCacheTable(self):
    """After execution, a filtered collection should compile down to its
    cached source table, and slicing it should still work.
    """
    filtered = self.odps_df[self.odps_df.name == 'name1']
    values = filtered.execute().values
    self.assertEqual(len(values), 2)
    self.assertTrue(context.is_cached(filtered))

    dag = self.engine.compile(filtered)
    ordered = dag.topological_sort()
    # only the cached source remains after compilation
    self.assertEqual(len(ordered), 1)
    self.assertTrue(is_source_collection(ordered[0].expr))

    sliced = filtered[:5]
    self.assertEqual(len(sliced.execute()), 2)
def testCache(self):
    """cache() on a filtered expression materializes it as a backend table."""
    data = self._gen_data(10, value_range=(-1000, 1000))

    cached_expr = self.expr[self.expr.id < 10].cache()
    cnt = cached_expr.count()

    dag = self.engine.compile(cached_expr)
    # the compiled DAG is expected to hold exactly two nodes
    self.assertEqual(len(dag.nodes()), 2)

    result = self.engine.execute(cnt)
    expected = len([row for row in data if row[1] < 10])
    self.assertEqual(expected, result)
    self.assertTrue(context.is_cached(cached_expr))

    # the cached object should be a concrete Seahawks table
    cached = context.get_cached(cached_expr)
    self.assertIsInstance(cached, SeahawksTable)