コード例 #1
0
    def testBatch(self):
        # Runs a sum and a mean over one cached, filtered collection: first as
        # a blocking parallel batch, then asynchronously with a timeout.
        if self.sql_engine.name == 'mysql':
            # TODO: mysqldb is not thread-safe, so the parallel batch is not
            # exercised on it yet. Report the test as skipped rather than
            # letting it pass silently.
            self.skipTest('mysqldb is not thread-safe')

        data = self._gen_data(10, value_range=(-1000, 1000))

        expr = self.expr[self.expr.id < 10].cache()
        expr1 = expr.id.sum()
        expr2 = expr.id.mean()

        # Both aggregates are compiled together; the assertions pin the node
        # count and the total number of entries in the DAG's adjacency map.
        dag = self.engine.compile([expr1, expr2])
        self.assertEqual(len(dag.nodes()), 3)
        self.assertEqual(sum(len(v) for v in dag._graph.values()), 2)

        # Expected results are recomputed locally from the generated rows.
        # Index 1 of each row is presumably the `id` column -- it is filtered
        # with the same `< 10` bound as the expression above.
        matched = [d[1] for d in data if d[1] < 10]
        expect1 = sum(matched)
        expect2 = (expect1 / float(len(matched))) if matched else 0.0

        res = self.engine.execute([expr1, expr2], n_parallel=2)
        self.assertEqual(res[0], expect1)
        self.assertAlmostEqual(res[1], expect2, delta=0.001)
        self.assertTrue(context.is_cached(expr))

        # test async and timeout
        expr = self.expr[self.expr.id < 10]
        expr1 = expr.id.sum()
        expr2 = expr.id.mean()

        # `async` is a reserved keyword from Python 3.7 on, so spelling the
        # argument as `async=True` no longer parses. Unpacking a dict sends
        # the very same keyword argument while remaining valid syntax on
        # every Python version.
        fs = self.engine.execute([expr, expr1, expr2], n_parallel=2,
                                 timeout=1, **{'async': True})
        self.assertEqual(len(fs), 3)

        # Each returned element exposes `.result()`; index 0 belongs to the
        # collection itself, so only the two aggregates are compared.
        self.assertEqual(fs[1].result(), expect1)
        self.assertAlmostEqual(fs[2].result(), expect2, delta=0.001)
        self.assertTrue(context.is_cached(expr))
コード例 #2
0
    def testHeadAndTail(self):
        # Exercises head()/tail() on the ODPS-backed and pandas-backed frames,
        # then head through an engine that forces the tunnel flag, and finally
        # head on a freshly created empty table.

        # --- ODPS-backed frame ---
        odps_head = self.odps_df.head(2)
        self.assertEqual(len(odps_head), 2)

        odps_name1 = self.odps_df[self.odps_df['name'] == 'name1']
        self.assertEqual(len(odps_name1.head(1)), 1)
        self.assertTrue(context.is_cached(odps_name1))

        odps_tail = self.odps_df.tail(2)
        self.assertEqual(len(odps_tail), 2)
        tail_ids = odps_tail.values['id']
        self.assertTrue(all(row_id > 1 for row_id in tail_ids))

        self.assertEqual(len(self.odps_df.name.head(2)), 2)
        self.assertEqual(len(self.odps_df.name.tail(2)), 2)

        # --- pandas-backed frame ---
        pd_head = self.pd_df.head(1)
        self.assertEqual(len(pd_head), 1)

        pd_name1 = self.pd_df[self.pd_df['name'] == 'name1']
        self.assertEqual(len(pd_name1.head(1)), 1)
        self.assertTrue(context.is_cached(pd_name1))

        pd_tail = self.pd_df.tail(1)
        self.assertEqual(len(pd_tail), 1)
        self.assertEqual(pd_tail.values['id'][0], 6)

        self.assertEqual(len(self.pd_df.name.head(1)), 1)
        self.assertEqual(len(self.pd_df.name.tail(1)), 1)

        # --- engine whose SQL backend always receives _force_tunnel=True ---
        class TunnelOnlyODPSEngine(ODPSSQLEngine):
            def _do_execute(self, *args, **kwargs):
                kwargs['_force_tunnel'] = True
                parent = super(TunnelOnlyODPSEngine, self)
                return parent._do_execute(*args, **kwargs)

        tunnel_engine = MixedEngine(self.odps)
        tunnel_engine._odpssql_engine = TunnelOnlyODPSEngine(self.odps)

        id_rows = tunnel_engine.execute(self.odps_df['id'], head=3)
        self.assertIsNotNone(id_rows)
        self.assertEqual(sum(id_rows.values['id']), 6)

        # --- head() on an empty table sharing the source table's schema ---
        empty_name = tn('pyodps_df_mixed2')
        self.odps.delete_table(empty_name, if_exists=True)
        source_table = next(self.odps_df.data_source())
        empty_table = self.odps.create_table(empty_name, source_table.schema)
        try:
            self.assertEqual(len(DataFrame(empty_table).head(10)), 0)
        finally:
            # Always remove the scratch table, even when the assertion fails.
            empty_table.drop()
コード例 #3
0
    def testPandasGroupbyFilter(self):
        # Filters a grouped aggregation on a pandas-backed frame, both before
        # and after the aggregation has been executed and cached, and checks
        # that repeated execution keeps returning the same values.
        import pandas as pd

        rows = [[2001, 1], [2002, 2], [2003, 3]]
        frame = DataFrame(pd.DataFrame(rows, columns=['id', 'fid']))
        wanted = [[2003, 3]]

        # Filter applied on an aggregation that has not been executed yet.
        grouped = frame.groupby('id').agg(frame.fid.sum())
        picked = grouped[grouped.id == 2003]
        self.assertEqual(picked.execute().values.values.tolist(), wanted)

        # Same filter, but the aggregation is executed (and cached) first.
        grouped = frame.groupby('id').agg(frame.fid.sum())
        grouped.execute()
        self.assertTrue(context.is_cached(grouped))
        picked = grouped[grouped.id == 2003]

        # Execute twice: the second pass must agree with the first.
        for _ in range(2):
            self.assertEqual(picked.execute().values.values.tolist(), wanted)

        total = frame.fid.sum()
        for _ in range(2):
            self.assertEqual(total.execute(), 6)
コード例 #4
0
    def testHandleCache(self):
        # After an expression has been executed and cached, compiling an
        # expression built on top of it should yield a single-node DAG.

        # Collection case: projection executed first, then filtered.
        projected = self.pd_df['name', self.pd_df.id + 1]
        projected.execute()
        self.assertTrue(context.is_cached(projected))

        filtered = projected[projected.id < 10]
        plan = self.engine.compile(filtered)
        self.assertEqual(len(plan.nodes()), 1)
        root_expr = plan.nodes()[0].expr
        self.assertTrue(is_source_collection(root_expr.input))

        # Scalar case: count executed first, then used in arithmetic.
        counted = self.pd_df[self.pd_df.id < 10].count()
        count_value = counted.execute()
        self.assertTrue(context.is_cached(counted))

        bumped = counted + 1
        plan = self.engine.compile(bumped)
        self.assertEqual(len(plan.nodes()), 1)
        first_field = plan.nodes()[0].expr._fields[0]
        # The left operand of the compiled field must carry a concrete value.
        self.assertIsNotNone(first_field.lhs.value)
        self.assertEqual(bumped.execute(), count_value + 1)
コード例 #5
0
    def testUseCache(self):
        # Drops the object backing a cached collection and checks that a
        # later execution still succeeds and leaves an existing table cached.
        self.engine._selecter.force_odps = True

        cached = self.odps_df[self.odps_df['name'] == 'name1'].cache()
        doubled = cached[cached.id * 2, cached.exclude('id')]
        first_rows = self.engine.execute(doubled, head=10)
        self.assertEqual(len(first_rows), 2)

        # Remove whatever currently backs the cached collection.
        context.get_cached(cached).drop()

        reselected = cached['name', cached.id * 2]
        second_rows = self.engine.execute(reselected, head=10)
        self.assertEqual(len(second_rows), 2)
        self.assertTrue(context.is_cached(cached))

        backing = context.get_cached(cached)
        self.assertTrue(self.odps.exist_table(backing.name))
コード例 #6
0
    def testExecuteCacheTable(self):
        # Executing a filtered collection should cache it, so that compiling
        # it again produces a single call on a source collection.
        filtered = self.odps_df[self.odps_df.name == 'name1']
        values = filtered.execute().values
        self.assertEqual(len(values), 2)
        self.assertTrue(context.is_cached(filtered))

        ordered_calls = self.engine.compile(filtered).topological_sort()
        self.assertEqual(len(ordered_calls), 1)
        self.assertTrue(is_source_collection(ordered_calls[0].expr))

        # A slice of the cached collection still yields both rows.
        sliced = filtered[:5]
        self.assertEqual(len(sliced.execute()), 2)
コード例 #7
0
    def testCache(self):
        # Executes a count over a cached, filtered collection and checks that
        # the collection ends up cached as a SeahawksTable.
        rows = self._gen_data(10, value_range=(-1000, 1000))

        cached_expr = self.expr[self.expr.id < 10].cache()
        counter = cached_expr.count()

        plan = self.engine.compile(cached_expr)
        self.assertEqual(len(plan.nodes()), 2)

        actual = self.engine.execute(counter)
        # Recount locally: index 1 of each generated row is compared with the
        # same `< 10` bound used in the expression (presumably the id column).
        expected = sum(1 for row in rows if row[1] < 10)
        self.assertEqual(expected, actual)
        self.assertTrue(context.is_cached(cached_expr))

        backing = context.get_cached(cached_expr)
        self.assertIsInstance(backing, SeahawksTable)