Exemple #1
0
    def testCachePersist(self):
        """Persist a cached join into a partition and check the stored row count.

        The left side is ``self.odps_df`` filtered on ``id > 0``, passed through
        an identity row function and cached; the right side is a freshly
        written two-row table filtered on ``fid > 0``.  The cached join result
        is persisted into partition ``ds=today`` of a new output table and is
        expected to contain 2 rows.
        """
        expr = self.odps_df

        data2 = [['name1', 3.2], ['name3', 2.4]]

        # Recreate the right-hand table from scratch so leftovers from an
        # earlier run cannot affect the join.
        table_name = tn('pyodps_test_mixed_engine_cp_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(['name', 'fid'], ['string', 'double']))
        expr2 = DataFrame(table2)
        self.odps.write_table(table2, 0, data2)

        # Identity row function: yields every row unchanged with the same
        # schema as the input expression.
        @output(expr.schema.names, expr.schema.types)
        def h(row):
            yield row

        left = expr.filter(expr.id > 0).apply(h, axis=1).cache()
        right = expr2.filter(expr2.fid > 0)
        joined = left.join(right, on=['name', right.fid < 4])['id', 'fid'].cache()

        output_table = tn('pyodps_test_mixed_engine_cp_output_table')
        self.odps.delete_table(output_table, if_exists=True)
        schema = Schema.from_lists(['id', 'fid'], ['bigint', 'double'], ['ds'],
                                   ['string'])
        output_t = self.odps.create_table(output_table,
                                          schema,
                                          if_not_exists=True)

        # Drop the output table even when persisting or the assertion fails,
        # so a failing run does not leave the table behind.
        try:
            t = joined.persist(output_table,
                               partition='ds=today',
                               create_partition=True)
            self.assertEqual(len(t.execute()), 2)
        finally:
            output_t.drop()
    def testUnion(self):
        """Union an ODPS-backed frame with a pandas-backed frame.

        First the sorted union run through ``self.engine`` is compared with
        the same union computed by ``self.pd_engine`` on a pandas copy of the
        ODPS data.  Then the union is persisted into a temporary table
        partitioned by ``name``; the table must hold 5 rows and a range filter
        on the partition column must execute.
        """
        mixed_union = self.odps_df.union(self.pd_df).sort(['id', 'name'])
        actual = self.engine.execute(mixed_union).values

        local_df = DataFrame(self.odps_df.to_pandas())
        local_union = local_df.union(self.pd_df).sort(['id', 'name'])
        wanted = self.pd_engine.execute(local_union).values
        self.assertTrue(actual.equals(wanted))

        # Same columns as self.t, except that 'name' becomes the partition
        # column instead of a regular column.
        data_columns = [c for c in self.t.schema.columns if c.name != 'name']
        partitioned_schema = Schema.from_lists(
            [c.name for c in data_columns],
            [c.type for c in data_columns],
            ['name'], ['string'])
        tmp_name = 'tmp_pyodps_%s' % str(uuid.uuid4()).replace('-', '_')
        tmp_table = self.odps.create_table(tmp_name, partitioned_schema)
        try:
            unioned = self.odps_df.union(self.pd_df)
            unioned.persist(tmp_table.name, create_table=False,
                            partitions=['name'])

            row_count = self.engine.execute(DataFrame(tmp_table).count())
            self.assertEqual(row_count, 5)

            self.engine._selecter.force_odps = False
            stored = DataFrame(tmp_table)
            in_range = stored.filter(stored.name > 'a', stored.name < 'b')
            # A length is never negative, so this only requires that the
            # filtered query executes without raising.
            self.assertGreaterEqual(len(self.engine.execute(in_range)), 0)
        finally:
            tmp_table.drop()
    def testCachePersist(self):
        """Persist a cached join into a partition and check the stored row count.

        The left side is ``self.odps_df`` filtered on ``id > 0``, passed through
        an identity row function and cached; the right side is a freshly
        written two-row table filtered on ``fid > 0``.  The cached join result
        is persisted into partition ``ds=today`` of a new output table and is
        expected to contain 2 rows.
        """
        expr = self.odps_df

        data2 = [["name1", 3.2], ["name3", 2.4]]

        # Recreate the right-hand table from scratch so leftovers from an
        # earlier run cannot affect the join.
        table_name = tn("pyodps_test_mixed_engine_cp_table2")
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name, schema=Schema.from_lists(["name", "fid"], ["string", "double"])
        )
        expr2 = DataFrame(table2)
        self.odps.write_table(table2, 0, data2)

        # Identity row function: yields every row unchanged with the same
        # schema as the input expression.
        @output(expr.schema.names, expr.schema.types)
        def h(row):
            yield row

        left = expr.filter(expr.id > 0).apply(h, axis=1).cache()
        right = expr2.filter(expr2.fid > 0)
        joined = left.join(right, on=["name", right.fid < 4])["id", "fid"].cache()

        output_table = tn("pyodps_test_mixed_engine_cp_output_table")
        self.odps.delete_table(output_table, if_exists=True)
        schema = Schema.from_lists(["id", "fid"], ["bigint", "double"], ["ds"], ["string"])
        output_t = self.odps.create_table(output_table, schema, if_not_exists=True)

        # Drop the output table even when persisting or the assertion fails,
        # so a failing run does not leave the table behind.
        try:
            t = joined.persist(output_table, partition="ds=today", create_partition=True)
            self.assertEqual(len(t.execute()), 2)
        finally:
            output_t.drop()
Exemple #4
0
    def testFetchTableSize(self):
        """Check ``fetch_data_source_size`` for several partition selections.

        Each selection is expressed once through ``filter_parts`` and once
        through equality predicates in ``filter``.  The assertions expect a
        positive size for ``self.pt``, for ds/hh/mm = today/curr/now and for
        ds/mm = today/now, and ``None`` for mm = now2 and for ds/hh only.
        """
        df = DataFrame(self.table)

        def source_size(expr):
            # Build the DAG in place (copy=False) and ask for the size of the
            # data source behind ``df``.
            dag = expr.to_dag(copy=False)
            return fetch_data_source_size(dag, df, self.table)

        # Partition specs passed to filter_parts.
        self.assertGreater(source_size(df.filter_parts(self.pt)), 0)
        self.assertGreater(
            source_size(df.filter_parts('ds=today,hh=curr,mm=now')), 0)
        self.assertIsNone(
            source_size(df.filter_parts('ds=today,hh=curr,mm=now2')))
        self.assertIsNone(source_size(df.filter_parts('ds=today,hh=curr')))
        self.assertGreater(source_size(df.filter_parts('ds=today,mm=now')), 0)

        # The same selections written as column predicates; the first two
        # differ only in the order of the predicates.
        self.assertGreater(
            source_size(
                df.filter(df.ds == 'today', df.mm == 'now', df.hh == 'curr')),
            0)
        self.assertGreater(
            source_size(
                df.filter(df.ds == 'today', df.hh == 'curr', df.mm == 'now')),
            0)
        self.assertIsNone(
            source_size(
                df.filter(df.ds == 'today', df.hh == 'curr', df.mm == 'now2')))
        self.assertIsNone(
            source_size(df.filter(df.ds == 'today', df.hh == 'curr')))
        self.assertGreater(
            source_size(df.filter(df.ds == 'today', df.mm == 'now')), 0)