def testCachePersist(self):
    """Persist a cached join into a partitioned table and check its row count.

    Builds a second table with two rows, joins the cached result of an
    ``apply`` over ``self.odps_df`` against a filtered view of that table,
    persists the cached join into partition ``ds=today`` of a freshly created
    output table, and asserts that reading the persisted result yields two
    rows.  The output table is dropped even when the persist step or the
    assertion fails.
    """
    expr = self.odps_df

    # Right-hand side of the join: a fresh two-row (name, fid) table.
    data2 = [['name1', 3.2], ['name3', 2.4]]
    table_name = tn('pyodps_test_mixed_engine_cp_table2')
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(['name', 'fid'], ['string', 'double']))
    expr2 = DataFrame(table2)
    self.odps.write_table(table2, 0, data2)

    # Row-wise pass-through function that keeps the source schema unchanged.
    @output(expr.schema.names, expr.schema.types)
    def h(row):
        yield row

    left = expr.filter(expr.id > 0).apply(h, axis=1).cache()
    right = expr2.filter(expr2.fid > 0)
    joined = left.join(right, on=['name', right.fid < 4])['id', 'fid'].cache()

    # Recreate the partitioned output table so the test starts from a clean slate.
    output_table = tn('pyodps_test_mixed_engine_cp_output_table')
    self.odps.delete_table(output_table, if_exists=True)
    schema = Schema.from_lists(
        ['id', 'fid'], ['bigint', 'double'], ['ds'], ['string'])
    output_t = self.odps.create_table(output_table, schema, if_not_exists=True)

    try:
        t = joined.persist(
            output_table, partition='ds=today', create_partition=True)
        self.assertEqual(len(t.execute()), 2)
    finally:
        # Always remove the output table, including on assertion failure.
        output_t.drop()
def testUnion(self):
    """Check union of ``self.odps_df`` with ``self.pd_df`` and persisting it.

    First compares the sorted union executed by ``self.engine`` against the
    same union executed by ``self.pd_engine`` on a pandas copy of
    ``self.odps_df``.  Then persists the union into a temporary table
    partitioned by ``name``, asserts the table holds five rows, and runs a
    filtered read on the partition column.  The temporary table is always
    dropped.
    """
    mixed_union = self.odps_df.union(self.pd_df).sort(['id', 'name'])
    result = self.engine.execute(mixed_union).values

    local_df = DataFrame(self.odps_df.to_pandas())
    local_union = local_df.union(self.pd_df).sort(['id', 'name'])
    expected = self.pd_engine.execute(local_union).values
    self.assertTrue(result.equals(expected))

    # Every column of self.t except ``name``, which becomes the partition column.
    data_columns = [c for c in self.t.schema.columns if c.name != 'name']
    schema = Schema.from_lists(
        [c.name for c in data_columns],
        [c.type for c in data_columns],
        ['name'], ['string'])
    tmp_name = 'tmp_pyodps_%s' % str(uuid.uuid4()).replace('-', '_')
    t = self.odps.create_table(tmp_name, schema)
    try:
        unioned = self.odps_df.union(self.pd_df)
        unioned.persist(t.name, create_table=False, partitions=['name'])
        row_count = self.engine.execute(DataFrame(t).count())
        self.assertEqual(row_count, 5)

        # Switch the selector's force_odps flag off before the filtered read.
        self.engine._selecter.force_odps = False
        df = DataFrame(t)
        name_window = df.filter(df.name > 'a', df.name < 'b')
        fetched = self.engine.execute(name_window)
        self.assertGreaterEqual(len(fetched), 0)
    finally:
        t.drop()
def testCachePersist(self):
    """Persist a cached join into a partitioned table and check its row count.

    Writes two rows into a helper table, joins the cached ``apply`` result
    over ``self.odps_df`` with a filtered view of that helper table, persists
    the cached join into partition ``ds=today`` of a newly created output
    table, and asserts the persisted result contains two rows.  Cleanup of
    the output table is guaranteed by ``try``/``finally``.
    """
    expr = self.odps_df

    # Helper (name, fid) table used as the right side of the join.
    data2 = [["name1", 3.2], ["name3", 2.4]]
    table_name = tn("pyodps_test_mixed_engine_cp_table2")
    self.odps.delete_table(table_name, if_exists=True)
    table2 = self.odps.create_table(
        name=table_name,
        schema=Schema.from_lists(["name", "fid"], ["string", "double"]),
    )
    expr2 = DataFrame(table2)
    self.odps.write_table(table2, 0, data2)

    # Identity row function declared with the source frame's own schema.
    @output(expr.schema.names, expr.schema.types)
    def h(row):
        yield row

    left = expr.filter(expr.id > 0).apply(h, axis=1).cache()
    right = expr2.filter(expr2.fid > 0)
    joined = left.join(right, on=["name", right.fid < 4])["id", "fid"].cache()

    # Drop any stale output table, then create it with a ``ds`` partition.
    output_table = tn("pyodps_test_mixed_engine_cp_output_table")
    self.odps.delete_table(output_table, if_exists=True)
    schema = Schema.from_lists(
        ["id", "fid"], ["bigint", "double"], ["ds"], ["string"]
    )
    output_t = self.odps.create_table(output_table, schema, if_not_exists=True)

    try:
        t = joined.persist(
            output_table, partition="ds=today", create_partition=True
        )
        self.assertEqual(len(t.execute()), 2)
    finally:
        # Runs on success and on failure so the output table never lingers.
        output_t.drop()
def testFetchTableSize(self):
    """Check ``fetch_data_source_size`` for several partition filters.

    Each case builds a filtered expression from a DataFrame over
    ``self.table``, converts it to a DAG without copying, and checks whether
    ``fetch_data_source_size`` reports a positive size or ``None``.  Cases
    cover both ``filter_parts`` partition strings and equivalent ``filter``
    predicates on the ``ds``/``hh``/``mm`` columns.
    """
    df = DataFrame(self.table)

    def source_size(expr):
        # Size reported for ``expr``'s DAG relative to the base frame and table.
        dag = expr.to_dag(copy=False)
        return fetch_data_source_size(dag, df, self.table)

    # Partition-string filters.
    self.assertGreater(source_size(df.filter_parts(self.pt)), 0)
    self.assertGreater(
        source_size(df.filter_parts('ds=today,hh=curr,mm=now')), 0)
    self.assertIsNone(
        source_size(df.filter_parts('ds=today,hh=curr,mm=now2')))
    self.assertIsNone(source_size(df.filter_parts('ds=today,hh=curr')))
    self.assertGreater(source_size(df.filter_parts('ds=today,mm=now')), 0)

    # Column-predicate filters, including a reordered predicate list.
    self.assertGreater(
        source_size(
            df.filter(df.ds == 'today', df.mm == 'now', df.hh == 'curr')),
        0)
    self.assertGreater(
        source_size(
            df.filter(df.ds == 'today', df.hh == 'curr', df.mm == 'now')),
        0)
    self.assertIsNone(
        source_size(
            df.filter(df.ds == 'today', df.hh == 'curr', df.mm == 'now2')))
    self.assertIsNone(
        source_size(df.filter(df.ds == 'today', df.hh == 'curr')))
    self.assertGreater(
        source_size(df.filter(df.ds == 'today', df.mm == 'now')), 0)