def persist(self, line):
    """Persist a frame from the IPython user namespace into an ODPS table.

    ``line`` has the form ``"<frame_name> [project.]<table_name>"``; a
    trailing ``;`` is tolerated.  Raises ``TypeError`` if the target table
    already exists.
    """
    try:
        import pandas as pd
        has_pandas = True
    except ImportError:
        has_pandas = False

    self._set_odps()

    cleaned = line.strip().strip(';')
    frame_name, table_name = cleaned.split(None, 1)

    # An optional "project." prefix selects a non-default project.
    project_name = None
    if '.' in table_name:
        project_name, table_name = table_name.split('.', 1)

    frame = self.shell.user_ns[frame_name]
    if self._odps.exist_table(table_name, project=project_name):
        raise TypeError('%s already exists' % table_name)

    if isinstance(frame, DataFrame):
        frame.persist(name=table_name, project=project_name)
    elif has_pandas and isinstance(frame, pd.DataFrame):
        # Wrap the raw pandas frame so it can be persisted the same way.
        DataFrame(frame).persist(name=table_name, project=project_name)
def persist(self, line):
    """Persist a frame from the IPython user namespace into an ODPS table.

    ``line`` has the form ``"<frame_name> [project.]<table_name>"``; a
    trailing ``;`` is tolerated.  Raises ``TypeError`` if the target table
    already exists.  Emits an HTML notification when done.
    """
    try:
        import pandas as pd
        has_pandas = True
    except ImportError:
        has_pandas = False

    self._set_odps()

    cleaned = line.strip().strip(';')
    frame_name, table_name = cleaned.split(None, 1)

    # An optional "project." prefix selects a non-default project.
    project_name = None
    if '.' in table_name:
        project_name, table_name = table_name.split('.', 1)

    frame = self.shell.user_ns[frame_name]
    if self._odps.exist_table(table_name, project=project_name):
        raise TypeError('%s already exists' % table_name)

    # notify=False: suppress the per-operation notification; a single
    # summary notification is emitted below instead.
    if isinstance(frame, DataFrame):
        frame.persist(name=table_name, project=project_name, notify=False)
    elif has_pandas and isinstance(frame, pd.DataFrame):
        DataFrame(frame).persist(name=table_name, project=project_name,
                                 notify=False)
    html_notify('Persist succeeded')
def testPandasPersistODPS2(self):
    """Persisting a pandas-backed frame keeps the fine-grained ODPS2 numeric types."""
    import pandas as pd
    import numpy as np

    # One random sample per extended numeric type, in schema order.
    columns = OrderedDict()
    for np_type, col in [(np.int8, 'data_int8'), (np.int16, 'data_int16'),
                         (np.int32, 'data_int32'), (np.int64, 'data_int64')]:
        columns[col] = np.random.randint(0, 10, (1, ), dtype=np_type)
    columns['data_float32'] = np.random.random((1, )).astype(np.float32)
    columns['data_float64'] = np.random.random((1, )).astype(np.float64)

    df = DataFrame(pd.DataFrame(columns))

    tmp_table_name = tn('pyodps_test_mixed_persist_odps2_types')
    self.odps.delete_table(tmp_table_name, if_exists=True)
    df.persist(tmp_table_name, lifecycle=1, drop_table=True, odps=self.odps)

    # Each numpy dtype must map onto the matching ODPS2 column type.
    t = self.odps.get_table(tmp_table_name)
    expected_types = [
        odps_types.tinyint, odps_types.smallint, odps_types.int_,
        odps_types.bigint, odps_types.float_, odps_types.double
    ]
    self.assertEqual(expected_types, t.schema.types)
def testExistingPersist(self):
    """Persisting into a pre-created table whose columns are reversed still works."""
    self.create_iris(IRIS_TABLE)
    df = DataFrame(self.odps.get_table(IRIS_TABLE)).append_id()

    # Build the target schema with the column order reversed on purpose.
    reversed_cols = list(reversed(df_schema_to_odps_schema(df.schema).columns))
    target_schema = Schema.from_lists([c.name for c in reversed_cols],
                                      [c.type for c in reversed_cols])

    self.odps.delete_table(EXISTING_PERSIST_TABLE, if_exists=True)
    self.odps.create_table(EXISTING_PERSIST_TABLE, target_schema)

    df.persist(EXISTING_PERSIST_TABLE)
class Test(TestBase):
    """Integration tests for the mixed (ODPS + pandas) execution engine."""

    def setup(self):
        import pandas as pd

        odps_data = [
            ['name1', 1],
            ['name2', 2],
            ['name1', 3],
        ]
        pd_data = [['name1', 5], ['name2', 6]]

        names = ['name', 'id']
        types = ['string', 'bigint']

        # One ODPS-backed and one pandas-backed frame sharing the same schema.
        table = tn('pyodps_df_mixed')
        self.odps.delete_table(table, if_exists=True)
        self.t = self.odps.create_table(table, Schema.from_lists(names, types))
        with self.t.open_writer() as w:
            w.write([self.t.new_record(r) for r in odps_data])

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)

    def teardown(self):
        self.t.drop()

    def testGroupReduction(self):
        expr = self.odps_df.select(
            self.odps_df, id2=self.odps_df.id.map(lambda x: x + 1))
        expr = expr.groupby('name').id2.sum()

        expected = [['name1', 6], ['name2', 3]]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(sorted([[r[1]] for r in expected]), sorted(result))

    def assertPandasEqual(self, df1, df2):
        from odps.compat import six
        from odps import types as o_types
        from pandas.util.testing import assert_frame_equal

        # compare column types
        def get_odps_type(p_type):
            for data_type, builtin_type in six.iteritems(
                    o_types._odps_primitive_to_builtin_types):
                if issubclass(p_type.type, builtin_type):
                    return data_type

        types1 = [get_odps_type(dt) for dt in df1.dtypes]
        types2 = [get_odps_type(dt) for dt in df2.dtypes]
        self.assertSequenceEqual(types1, types2)

        assert_frame_equal(df1, df2, check_dtype=False)

    def testJoin(self):
        expr = self.odps_df.join(self.pd_df, 'name').sort('id_x')
        result = self.engine.execute(expr).values

        # Re-run the same join fully in pandas as the reference result.
        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))

    def testUnion(self):
        expr = self.odps_df.union(self.pd_df).sort(['id', 'name'])
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df.union(self.pd_df).sort(['id', 'name'])).values
        self.assertTrue(result.equals(expected))

    def testIsIn(self):
        expr = self.odps_df['name'].isin(self.pd_df['name']).rename('isin')
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df['name'].isin(self.pd_df['name']).rename('isin')).values
        self.assertTrue(result.equals(expected))

    def testMixed(self):
        expr = self.odps_df.union(
            self.odps_df.join(self.pd_df, 'name')[
                lambda x: x.name, lambda x: x.id_x.rename('id')
            ]).sort(['name', 'id'])
        expr = expr[expr['name'].isin(self.pd_df['name'])]
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        test_expr = df.union(
            df.join(self.pd_df, 'name')[
                lambda x: x.name, lambda x: x.id_x.rename('id')
            ]).sort(['name', 'id'])
        test_expr = test_expr[test_expr['name'].isin(self.pd_df['name'])]
        expected = self.pd_engine.execute(test_expr).values
        self.assertTrue(result.equals(expected))

    def testPandasPersist(self):
        import pandas as pd
        import numpy as np

        self.odps.to_global()

        tmp_table_name = tn('pyodps_test_mixed_persist')
        self.odps.delete_table(tmp_table_name, if_exists=True)

        pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
        df = DataFrame(pd_df).persist(tmp_table_name)
        self.assertPandasEqual(df.to_pandas(), pd_df)

        self.odps.delete_table(tmp_table_name)

    def testExecuteCacheTable(self):
        df = self.odps_df[self.odps_df.name == 'name1']
        result = df.execute().values
        self.assertEqual(len(result), 2)
        self.assertIsNotNone(df._cache_data)

        # Recompiling a cached expr should resolve to its cached source data.
        _, new_df, cbs = self.engine._compile(df)
        try:
            self.assertIsNotNone(new_df._source_data)
        finally:
            [cb() for cb in cbs]

        df2 = df[:5]
        result = df2.execute()
        self.assertEqual(len(result), 2)

    def testHandleCache(self):
        df = self.pd_df['name', self.pd_df.id + 1]
        df.execute()
        self.assertIsNotNone(df._cache_data)

        df2 = df[df.id < 10]
        _, new_df2, cbs = self.engine._compile(df2)
        try:
            self.assertIsNotNone(new_df2.input._source_data)
        finally:
            [cb() for cb in cbs]

    def testCacheTable(self):
        df = self.odps_df.join(self.pd_df, 'name').cache()
        df2 = df.sort('id_x')

        dag = self.engine._compile_dag(df2)
        self.assertEqual(len(dag.nodes()), 3)

        result = self.engine.execute(df2).values

        df3 = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df3.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))
        self.assertEqual(len(self.engine._generated_table_names), 2)

        # Re-execution must reuse the cached table rather than rebuilding it.
        table = df._cache_data
        self.assertEqual(len(df.execute()), len(expected))
        self.assertIs(df._cache_data, table)

        df4 = df[df.id_x < 3].count()
        result = self.engine.execute(df4)
        self.assertEqual(result, 2)
        self.assertEqual(df4._cache_data, 2)

    def testUseCache(self):
        df = self.odps_df[self.odps_df['name'] == 'name1']
        self.assertEqual(len(df.head(10)), 2)

        # Dropping the backing cache table makes later executions fail.
        df._cache_data.drop()
        self.assertRaises(
            ODPSError, lambda: self.engine.execute(df['name', 'id']))

        def plot(**_):
            pass

        self.assertRaises(ODPSError, lambda: df.plot(x='id', plot_func=plot))

    def testPivot(self):
        data = [['name1', 1, 1.0, True], ['name1', 2, 2.0, True],
                ['name2', 1, 3.0, False], ['name2', 3, 4.0, False]]

        table_name = tn('pyodps_test_mixed_engine_pivot')
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(
                ['name', 'id', 'fid', 'ismale'],
                ['string', 'bigint', 'double', 'boolean']))
        expr = DataFrame(table)
        try:
            self.odps.write_table(table, 0, data)

            # Single value column.
            expr1 = expr.pivot(
                rows='id', columns='name', values='fid').distinct()
            res = self.engine.execute(expr1)
            result = self._get_result(res)
            expected = [[1, 1.0, 3.0], [2, 2.0, None], [3, None, 4.0]]
            self.assertEqual(sorted(result), sorted(expected))

            # Multiple value columns.
            expr2 = expr.pivot(
                rows='id', columns='name', values=['fid', 'ismale'])
            res = self.engine.execute(expr2)
            result = self._get_result(res)
            expected = [[1, 1.0, 3.0, True, False],
                        [2, 2.0, None, True, None],
                        [3, None, 4.0, None, False]]
            self.assertEqual(sorted(result), sorted(expected))

            # Selecting a non-existing pivoted column must raise.
            expr3 = expr.pivot(rows='id', columns='name', values='fid')['name3']
            with self.assertRaises(ValueError) as cm:
                self.engine.execute(expr3)
            self.assertIn('name3', str(cm.exception))

            expr4 = expr.pivot(
                rows='id', columns='name', values='fid')['id', 'name1']
            res = self.engine.execute(expr4)
            result = self._get_result(res)
            expected = [[1, 1.0], [2, 2.0], [3, None]]
            self.assertEqual(sorted(result), sorted(expected))

            # Derived column on top of a pivot.
            expr5 = expr.pivot(rows='id', columns='name', values='fid')
            expr5 = expr5[expr5,
                          (expr5['name1'].astype('int') + 1).rename('new_name')]
            res = self.engine.execute(expr5)
            result = self._get_result(res)
            expected = [[1, 1.0, 3.0, 2.0], [2, 2.0, None, 3.0],
                        [3, None, 4.0, None]]
            self.assertEqual(sorted(result), sorted(expected))

            # Join of a pivot result with another frame.
            expr6 = expr.pivot(rows='id', columns='name', values='fid')
            expr6 = expr6.join(self.odps_df, on='id')[expr6, 'name']
            res = self.engine.execute(expr6)
            result = self._get_result(res)
            expected = [[1, 1.0, 3.0, 'name1'], [2, 2.0, None, 'name2'],
                        [3, None, 4.0, 'name1']]
            self.assertEqual(sorted(result), sorted(expected))
        finally:
            table.drop()

    def testPivotTable(self):
        data = [['name1', 1, 1.0, True], ['name1', 1, 5.0, True],
                ['name1', 2, 2.0, True], ['name2', 1, 3.0, False],
                ['name2', 3, 4.0, False]]

        table_name = tn('pyodps_test_mixed_engine_pivot_table')
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(
                ['name', 'id', 'fid', 'ismale'],
                ['string', 'bigint', 'double', 'boolean']))
        expr = DataFrame(table)
        try:
            self.odps.write_table(table, 0, data)

            # Default aggregation is the mean.
            expr1 = expr.pivot_table(rows='name', values='fid')
            res = self.engine.execute(expr1)
            result = self._get_result(res)
            expected = [
                ['name1', 8.0 / 3],
                ['name2', 3.5],
            ]
            self.assertEqual(sorted(result), sorted(expected))

            expr2 = expr.pivot_table(
                rows='name', values='fid', aggfunc=['mean', 'sum'])
            res = self.engine.execute(expr2)
            result = self._get_result(res)
            expected = [
                ['name1', 8.0 / 3, 8.0],
                ['name2', 3.5, 7.0],
            ]
            self.assertEqual(res.schema.names, ['name', 'fid_mean', 'fid_sum'])
            self.assertEqual(sorted(result), sorted(expected))

            expr3 = expr.pivot_table(rows='id', values='fid', columns='name',
                                     fill_value=0).distinct()
            res = self.engine.execute(expr3)
            result = self._get_result(res)
            expected = [[1, 3.0, 3.0], [2, 2.0, 0], [3, 0, 4.0]]
            self.assertEqual(res.schema.names,
                             ['id', 'name1_fid_mean', 'name2_fid_mean'])
            self.assertEqual(result, expected)

            # Custom aggregation class mixed with a builtin aggregation name.
            class Agg(object):
                def buffer(self):
                    return [0]

                def __call__(self, buffer, val):
                    buffer[0] += val

                def merge(self, buffer, pbuffer):
                    buffer[0] += pbuffer[0]

                def getvalue(self, buffer):
                    return buffer[0]

            aggfuncs = OrderedDict([('my_sum', Agg), ('mean', 'mean')])
            expr4 = expr.pivot_table(rows='id', values='fid', columns='name',
                                     fill_value=0, aggfunc=aggfuncs)
            res = self.engine.execute(expr4)
            result = self._get_result(res)
            expected = [[1, 6.0, 3.0, 3.0, 3.0], [2, 2.0, 0, 2.0, 0],
                        [3, 0, 4.0, 0, 4.0]]
            self.assertEqual(res.schema.names, [
                'id', 'name1_fid_my_sum', 'name2_fid_my_sum',
                'name1_fid_mean', 'name2_fid_mean'
            ])
            self.assertEqual(result, expected)
        finally:
            table.drop()

    def testHeadAndTail(self):
        res = self.odps_df.head(2)
        self.assertEqual(len(res), 2)

        df = self.odps_df[self.odps_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertIsNotNone(df._cache_data)

        res = self.odps_df.tail(2)
        self.assertEqual(len(res), 2)
        self.assertTrue(all(it > 1 for it in res.values['id']))

        self.assertEqual(len(self.odps_df.name.head(2)), 2)
        self.assertEqual(len(self.odps_df.name.tail(2)), 2)

        res = self.pd_df.head(1)
        self.assertEqual(len(res), 1)

        df = self.pd_df[self.pd_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertIsNotNone(df._cache_data)

        res = self.pd_df.tail(1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res.values['id'][0], 6)

        self.assertEqual(len(self.pd_df.name.head(1)), 1)
        self.assertEqual(len(self.pd_df.name.tail(1)), 1)

    def testMapReduceWithResource(self):
        pd_df2 = self.odps_df.to_pandas(wrap=True)

        @output(['name', 'id'], ['string', 'int'])
        def reducer(resources):
            d = dict()
            for r in resources[0]:
                if r.name in d:
                    d[r.name] += r.id
                else:
                    d[r.name] = r.id

            def inner(keys):
                def h(row, done):
                    if row.name in d:
                        d[row.name] += row.id
                    else:
                        d[row.name] = row.id
                    if done:
                        yield row.name, d[row.name]
                return h
            return inner

        # Every combination of (odps/pandas source) x (odps/pandas resource)
        # must produce the same total.
        expr = pd_df2.map_reduce(reducer=reducer,
                                 reducer_resources=[self.pd_df], group='name')
        result = expr.execute()
        self.assertEqual(result.values['id'].sum(), 17)

        odps_df2 = self.pd_df.persist(tn('pyodps_df_mixed2'), odps=self.odps)
        try:
            expr = self.odps_df.map_reduce(reducer=reducer,
                                           reducer_resources=[odps_df2],
                                           group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)

            expr = self.odps_df.map_reduce(reducer=reducer,
                                           reducer_resources=[self.pd_df],
                                           group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)

            expr = pd_df2.map_reduce(reducer=reducer,
                                     reducer_resources=[odps_df2],
                                     group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)
        finally:
            next(odps_df2.data_source()).drop()

    def testBloomFilter(self):
        import numpy as np

        data2 = [['name1'], ['name3']]

        table_name = tn('pyodps_test_mixed_engine_bf_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name, schema=Schema.from_lists(['name'], ['string']))
        expr2 = DataFrame(table2)

        self.odps.write_table(table2, 0, data2)
        try:
            expr = self.odps_df.bloom_filter(
                'name', expr2[:1].name, capacity=10)
            res = self.engine.execute(expr)
            self.assertTrue(np.all(res['name'] != 'name2'))
        finally:
            table2.drop()

    def testCachePersist(self):
        expr = self.odps_df

        data2 = [['name1', 3.2], ['name3', 2.4]]

        table_name = tn('pyodps_test_mixed_engine_cp_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(['name', 'fid'], ['string', 'double']))
        expr2 = DataFrame(table2)
        self.odps.write_table(table2, 0, data2)

        @output(expr.schema.names, expr.schema.types)
        def h(row):
            yield row

        l = expr.filter(expr.id > 0).apply(h, axis=1).cache()
        r = expr2.filter(expr2.fid > 0)
        joined = l.join(r, on=['name', r.fid < 4])['id', 'fid'].cache()

        output_table = tn('pyodps_test_mixed_engine_cp_output_table')
        self.odps.delete_table(output_table, if_exists=True)
        schema = Schema.from_lists(['id', 'fid'], ['bigint', 'double'],
                                   ['ds'], ['string'])
        output_t = self.odps.create_table(output_table, schema,
                                          if_not_exists=True)

        t = joined.persist(output_table, partition='ds=today',
                           create_partition=True)
        self.assertEqual(len(t.execute()), 2)

        output_t.drop()

    def testBigintPartitionedCache(self):
        table = tn('pyodps_test_bigint_partitioned_cache')
        self.odps.delete_table(table, if_exists=True)
        expr = self.odps_df.persist(table, partitions=['id'])

        @output(['id', 'name'], ['int', 'string'])
        def handle(row):
            return row.id + 1, row.name

        expr = expr['tt' + expr.name, expr.id].cache()
        new_expr = expr.map_reduce(mapper=handle)
        res = self.engine.execute(new_expr)
        self.assertEqual(len(res), 3)
# Re-fit a dedicated regressor on the high-'tl' samples (tl >= 4) and overwrite
# the submission values for those vids with the larger of the two predictions.
vid_gt_4 = odps.get_table('tl_gt_4_vid_6_6').to_df().to_pandas()['vid']

predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']
# Keep every training column that is not an id, a target or a log-derived one.
use_features = [
    t for t in train.columns
    if t != 'vid' and t not in predict_features and t != 'pos_4'
    and 'log' not in t
]

pos_eq_1 = test[test['vid'].isin(vid_gt_4)]
test_eq_1 = pos_eq_1.loc[:, use_features]
submission_gt_4 = pos_eq_1.loc[:, ['vid', 'tl']]

train_gt_4 = train[train['tl'] >= 4]
train_gt_4.index = list(range(train_gt_4.shape[0]))

model = GradientBoostingRegressor(learning_rate=0.01,
                                  n_estimators=800,
                                  max_depth=5,
                                  subsample=0.8,
                                  random_state=1,
                                  verbose=1,
                                  min_samples_leaf=20)
gbdt_model(train_gt_4, 'tl', use_features, test_eq_1, submission_gt_4, model)

# For the affected vids take the row-wise maximum of the original and the
# specialised predictions.
gt_4_index = submission[submission['vid'].isin(submission_gt_4['vid'])].index
submission_temp = submission.loc[gt_4_index, ['vid', 'tl']]
merge_fat = pd.merge(submission_temp, submission_gt_4, on='vid')
temp_columns = [tc for tc in merge_fat.columns if tc != 'vid']
replace_num = np.max(merge_fat.loc[:, temp_columns], axis=1)
submission.loc[gt_4_index, 'tl'] = replace_num.values

print(submission.sort_values(by=['tl'], ascending=False))
sub_final = DataFrame(submission)
sub_final.persist('tl_jz_5_fold_6_6_22_submit_modified_high_value')
class Test(TestBase):
    """Mixed-engine tests exercising ODPS- and pandas-backed frames together."""

    def setup(self):
        import pandas as pd

        odps_data = [
            ['name1', 1],
            ['name2', 2],
            ['name1', 3],
        ]
        pd_data = [
            ['name1', 5],
            ['name2', 6]
        ]

        names = ['name', 'id']
        types = ['string', 'bigint']

        # One ODPS-backed and one pandas-backed frame sharing the same schema.
        table = tn('pyodps_df_mixed')
        self.odps.delete_table(table, if_exists=True)
        self.t = self.odps.create_table(table, Schema.from_lists(names, types))
        with self.t.open_writer() as w:
            w.write([self.t.new_record(r) for r in odps_data])

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)

    def teardown(self):
        self.t.drop()

    def assertPandasEqual(self, df1, df2):
        from odps.compat import six
        from odps import types as o_types
        from pandas.util.testing import assert_frame_equal

        # compare column types
        def get_odps_type(p_type):
            for data_type, builtin_type in six.iteritems(
                    o_types._odps_primitive_to_builtin_types):
                if issubclass(p_type.type, builtin_type):
                    return data_type

        types1 = [get_odps_type(dt) for dt in df1.dtypes]
        types2 = [get_odps_type(dt) for dt in df2.dtypes]
        self.assertSequenceEqual(types1, types2)

        assert_frame_equal(df1, df2, check_dtype=False)

    def testJoin(self):
        expr = self.odps_df.join(self.pd_df, 'name').sort('id_x')
        result = self.engine.execute(expr).values

        # Re-run the same join fully in pandas as the reference result.
        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))

    def testUnion(self):
        expr = self.odps_df.union(self.pd_df).sort(['id', 'name'])
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df.union(self.pd_df).sort(['id', 'name'])).values
        self.assertTrue(result.equals(expected))

    def testIsIn(self):
        expr = self.odps_df['name'].isin(self.pd_df['name']).rename('isin')
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df['name'].isin(self.pd_df['name']).rename('isin')).values
        self.assertTrue(result.equals(expected))

    def testMixed(self):
        expr = self.odps_df.union(
            self.odps_df.join(self.pd_df, 'name')[
                lambda x: x.name, lambda x: x.id_x.rename('id')
            ]).sort(['name', 'id'])
        expr = expr[expr['name'].isin(self.pd_df['name'])]
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        test_expr = df.union(
            df.join(self.pd_df, 'name')[
                lambda x: x.name, lambda x: x.id_x.rename('id')
            ]).sort(['name', 'id'])
        test_expr = test_expr[test_expr['name'].isin(self.pd_df['name'])]
        expected = self.pd_engine.execute(test_expr).values
        self.assertTrue(result.equals(expected))

    def testPandasPersist(self):
        import pandas as pd
        import numpy as np

        self.odps.to_global()

        tmp_table_name = tn('pyodps_test_mixed_persist')
        self.odps.delete_table(tmp_table_name, if_exists=True)

        pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
        df = DataFrame(pd_df).persist(tmp_table_name)
        self.assertPandasEqual(df.to_pandas(), pd_df)

        self.odps.delete_table(tmp_table_name)

    def testExecuteCacheTable(self):
        df = self.odps_df[self.odps_df.name == 'name1']
        result = df.execute().values
        self.assertEqual(len(result), 2)
        self.assertIsNotNone(df._cache_data)

        # Pre-processing then compiling should resolve the cached source data.
        new_df = self.engine._pre_process(df)
        _, new_df, cbs = self.engine._compile(new_df)
        try:
            self.assertIsNotNone(new_df._source_data)
        finally:
            [cb() for cb in cbs]

        df2 = df[:5]
        result = df2.execute()
        self.assertEqual(len(result), 2)

    def testHandleCache(self):
        df = self.pd_df['name', self.pd_df.id + 1]
        df.execute()
        self.assertIsNotNone(df._cache_data)

        df2 = df[df.id < 10]
        new_df2 = self.engine._pre_process(df2)
        _, new_df2, cbs = self.engine._compile(new_df2)
        try:
            self.assertIsNotNone(new_df2.input._source_data)
        finally:
            [cb() for cb in cbs]

    def testCacheTable(self):
        df = self.odps_df.join(self.pd_df, 'name').cache()
        df2 = df.sort('id_x')

        dag = self.engine._compile_dag(df2)
        self.assertEqual(len(dag.nodes()), 3)

        result = self.engine.execute(df2).values

        df3 = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df3.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))
        self.assertEqual(len(self.engine._generated_table_names), 2)

        # Re-execution must reuse the cached table rather than rebuilding it.
        table = df._cache_data
        self.assertEqual(len(df.execute()), len(expected))
        self.assertIs(df._cache_data, table)

        df4 = df[df.id_x < 3].count()
        result = self.engine.execute(df4)
        self.assertEqual(result, 2)
        self.assertEqual(df4._cache_data, 2)

    def testUseCache(self):
        df = self.odps_df[self.odps_df['name'] == 'name1']
        self.assertEqual(len(df.head(10)), 2)

        # Dropping the backing cache table makes later executions fail.
        df._cache_data.drop()
        self.assertRaises(
            ODPSError, lambda: self.engine.execute(df['name', 'id']))

        def plot(**_):
            pass

        self.assertRaises(ODPSError, lambda: df.plot(x='id', plot_func=plot))

    def testHeadAndTail(self):
        res = self.odps_df.head(2)
        self.assertEqual(len(res), 2)

        df = self.odps_df[self.odps_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertIsNotNone(df._cache_data)

        res = self.odps_df.tail(2)
        self.assertEqual(len(res), 2)
        self.assertTrue(all(it > 1 for it in res.values['id']))

        self.assertEqual(len(self.odps_df.name.head(2)), 2)
        self.assertEqual(len(self.odps_df.name.tail(2)), 2)

        res = self.pd_df.head(1)
        self.assertEqual(len(res), 1)

        df = self.pd_df[self.pd_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertIsNotNone(df._cache_data)

        res = self.pd_df.tail(1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res.values['id'][0], 6)

        self.assertEqual(len(self.pd_df.name.head(1)), 1)
        self.assertEqual(len(self.pd_df.name.tail(1)), 1)

    def testMapReduceWithResource(self):
        pd_df2 = self.odps_df.to_pandas(wrap=True)

        @output(['name', 'id'], ['string', 'int'])
        def reducer(resources):
            d = dict()
            for r in resources[0]:
                if r.name in d:
                    d[r.name] += r.id
                else:
                    d[r.name] = r.id

            def inner(keys):
                def h(row, done):
                    if row.name in d:
                        d[row.name] += row.id
                    else:
                        d[row.name] = row.id
                    if done:
                        yield row.name, d[row.name]
                return h
            return inner

        # Every combination of (odps/pandas source) x (odps/pandas resource)
        # must produce the same total.
        expr = pd_df2.map_reduce(reducer=reducer,
                                 reducer_resources=[self.pd_df], group='name')
        result = expr.execute()
        self.assertEqual(result.values['id'].sum(), 17)

        odps_df2 = self.pd_df.persist('pyodps_df_mixed2', odps=self.odps)
        try:
            expr = self.odps_df.map_reduce(reducer=reducer,
                                           reducer_resources=[odps_df2],
                                           group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)

            expr = self.odps_df.map_reduce(reducer=reducer,
                                           reducer_resources=[self.pd_df],
                                           group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)

            expr = pd_df2.map_reduce(reducer=reducer,
                                     reducer_resources=[odps_df2],
                                     group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)
        finally:
            next(odps_df2.data_source()).drop()

    def testBloomFilter(self):
        import numpy as np

        data2 = [
            ['name1'],
            ['name3']
        ]

        table_name = tn('pyodps_test_mixed_engine_bf_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name, schema=Schema.from_lists(['name'], ['string']))
        expr2 = DataFrame(table2)

        self.odps.write_table(
            table2, 0, [table2.new_record(values=d) for d in data2])
        try:
            expr = self.odps_df.bloom_filter(
                'name', expr2[:1].name, capacity=10)
            res = self.engine.execute(expr)
            self.assertTrue(np.all(res['name'] != 'name2'))
        finally:
            table2.drop()
# NOTE(review): this chunk begins MID-STATEMENT — the opening of the first
# assignment (presumably ``train = odps.get_table(``) lies before this view,
# so the code below is kept verbatim rather than restructured.
'juz_train_6_6_add_wzm_for145_final').to_df().to_pandas() test = odps.get_table( 'juz_test_6_6_add_wzm_for145_final').to_df().to_pandas() print(train.shape, test.shape) gene_list = get_one_hot_list(train, use_label, 14) train.replace(-999, np.nan, inplace=True) test.replace(-999, np.nan, inplace=True) drop_snp = [s for s in train.columns if 'snp' in s] train.drop(drop_snp, axis=1, inplace=True) test.drop(drop_snp, axis=1, inplace=True) gene_data = odps.get_table('meinian_round2_snp').to_df().to_pandas() snp_data = pd.get_dummies(gene_data.loc[:, gene_list]) snp_data['vid'] = gene_data['vid'].values for s in snp_data.columns: if s != 'vid': snp_data[s] = snp_data[s].astype(int) train_merge = pd.merge(train, snp_data, on='vid', how='left') test_merge = pd.merge(test, snp_data, on='vid', how='left') train_merge.fillna(-999, inplace=True) test_merge.fillna(-999, inplace=True) print('final train shape:{}, test shape:{} '.format( train_merge.shape, test_merge.shape)) juz_train = DataFrame(train_merge) juz_test = DataFrame(test_merge) juz_train.persist('{}_juz_train_6_6_snp_onehot_22'.format(use_label)) juz_test.persist('{}_juz_test_6_6_snp_onehot_22'.format(use_label))
# NOTE(review): this chunk begins mid-function — the head of ``merge_table``
# (its ``def`` line and the ``if`` matching the ``else`` below) lies before
# this view, so the code is kept verbatim; only comments were translated.
merge_df = "$".join(list(df['results'])) else: merge_df = df['results'].values[0] return merge_df # simple data cleanup print(part_1_2.shape) is_happen = part_1_2.groupby(['vid', 'test_id']).size().reset_index() # rebuild an index used for de-duplication is_happen['new_index'] = is_happen['vid'] + '_' + is_happen['test_id'] is_happen_new = is_happen[is_happen[0] > 1]['new_index'] part_1_2['new_index'] = part_1_2['vid'] + '_' + part_1_2['test_id'] unique_part = part_1_2[part_1_2['new_index'].isin(list(is_happen_new))] unique_part = unique_part.sort_values(['vid', 'test_id']) no_unique_part = part_1_2[~part_1_2['new_index'].isin(list(is_happen_new))] print('begin') part_1_2_not_unique = unique_part.groupby(['vid', 'test_id' ]).apply(merge_table).reset_index() part_1_2_not_unique.rename(columns={0: 'results'}, inplace=True) tmp = pd.concat( [part_1_2_not_unique, no_unique_part[['vid', 'test_id', 'results']]]) # pivot rows to columns print('finish') tmp = tmp.pivot(index='vid', values='results', columns='test_id') print(tmp.shape) combine_data = DataFrame(tmp, unknown_as_string=True) combine_data.persist('origin_data_combine_part1_part2') print('total time', time.time() - begin_time)
def gbdt_model(df, label, use_feature, true_test, submission_data, gbdt_model):
    """Train the supplied GBDT estimator with 5-fold CV and persist positive vids.

    Behaviour-identical restyle of the original: the five manually unrolled
    folds are expressed as one loop over the precomputed fold index pairs.
    Out-of-fold predictions are collected for a classification report, and
    each fold casts one vote per test row; any positive vote marks the row.
    """
    print(submission_data.head())
    print("基于GBDT: 开始训练 label 为{}...".format(label))
    value4preds = df['pos_4']
    train_data = df.loc[:, use_feature]
    print(train_data.shape, true_test.shape)
    pred_labels = np.zeros(df.shape[0])
    submission_label = np.zeros((true_test.shape[0], 5))
    kf = KFold(n_splits=5, shuffle=True, random_state=1024)
    five_fold_index = list(kf.split(train_data, value4preds))
    for fold_no, (fit_idx, holdout_idx) in enumerate(five_fold_index, start=1):
        print('第{}次训练...'.format(fold_no))
        x_fit = train_data.iloc[fit_idx]
        x_holdout = train_data.iloc[holdout_idx]
        y_fit = value4preds.iloc[fit_idx]
        y_holdout = value4preds.iloc[holdout_idx]  # kept for parity; unused, as in the original
        gbdt_model.fit(x_fit, y_fit)
        # out-of-fold binary prediction at a 0.5 threshold
        pred_labels[x_holdout.index] = np.where(gbdt_model.predict(x_holdout) > 0.5, 1, 0)
        # one vote per fold on the real test set
        submission_label[:, fold_no - 1] = np.where(gbdt_model.predict(true_test) > 0.5, 1, 0)
        print('第{}次训练结束'.format(fold_no))
        print('*******************************************************************')
    # a row is flagged positive if at least one of the five folds voted positive
    submission_data['pos_4'] = np.where(np.sum(submission_label, axis=1) >= 1, 1, 0)
    print(classification_report(pred_labels, value4preds))
    print(submission_data[submission_data['pos_4'] == 1])
    sub_class = DataFrame(submission_data[submission_data['pos_4'] == 1], unknown_as_string=True)
    sub_class.persist('tl_gt_4_vid_6_6')
data_frame['heart_times'] = data_frame['1001'].apply(extract_num_norm) data_frame['all_result'] = '_' for p in data_frame.columns: if p != 'vid': data_frame['all_result'] = data_frame[ 'all_result'] + '_' + data_frame[p].astype('str') data_frame['gender'] = data_frame['all_result'].apply(is_sex) del data_frame['all_result'] new_add = [ 'xue_ya_pian_gao', 'xue_zhi_pian_gao', 'xue_tang_pian_gao', 'high_sugar', 'guan_xin_bin', 'shen', 'smoke', 'niao', 'heart_rate', '3399_w', '3301_w', '0403_w', '0421_w', '0405_w', 'gender', 'blood_pipe_style', 'health', 'pres_front', 'pres_back', 'heart_times', 'vid', 'dannan_jieshi', 'dannan_xirou', 'shen_jieshi', 'shen_nanz', 'gan_nanz', 'gan_ying_hua' ] yy_list.extend(new_add) return data_frame.loc[:, yy_list] if __name__ == "__main__": part_1_2 = odps.get_table( 'origin_data_combine_part1_part2').to_df().to_pandas() word_data = word2num(part_1_2) print('the shape of word_data: ', word_data.shape) juz_word_data = DataFrame(word_data) juz_word_data.persist('juz_word_data_5_30')
num_ex_str = data_frame.loc[:, total] return num_ex_str if __name__ == "__main__": part_1_2 = odps.get_table( 'origin_data_combine_part1_part2').to_df().to_pandas() part_1_2['jia_zx'] = split_data(part_1_2['0101'], '甲状腺') part_1_2['left_shen'] = split_data(part_1_2['0117'], '左肾') part_1_2['right_shen'] = split_data(part_1_2['0118'], '右肾') part_1_2_copy = part_1_2.copy(deep=True) ex_num_data = ex_num_from_str(part_1_2) print('the shape of the num_data get from word: ', ex_num_data.shape) pure_num_data = get_pure_num_features(part_1_2_copy, 0.96) pure_columns = [p for p in pure_num_data.columns if p != 'vid'] ex_num_columns = [ i for i in ex_num_data.columns if i not in ['vid', '314', '1308', '1319', '1320', '1321', '1322', '0424', '0425'] ] print('the shape of origin num data: ', pure_num_data.shape) numeric_data = pd.merge(pure_num_data, ex_num_data, on='vid', how='inner') exm_drop = [] for w in pure_columns + ex_num_columns: if np.abs(numeric_data[w].skew()) > 12: exm_drop.append(w) print(exm_drop) numeric_data.drop(exm_drop, axis=1, inplace=True) print('total data shape: ', numeric_data.shape) juz_num_data = DataFrame(numeric_data) juz_num_data.persist('juz_num_data_5_31')
c2_test = test_merge.loc[(test_merge['2403']!=-999)&(test_merge['2404']==-999)&(test_merge['2405']!=-999), '2404'].index.values c3_test = test_merge.loc[(test_merge['2403']!=-999)&(test_merge['2404']!=-999)&(test_merge['2405']==-999), '2405'].index.values for i in c1_test: test_merge.loc[i, '2403'] = (test_merge.loc[i, '2404']/100) * (test_merge.loc[i, '2404']/100) * (test_merge.loc[i, '2405']) for i in c2_test: test_merge.loc[i, '2404'] = np.sqrt((test_merge.loc[i, '2403']) / (test_merge.loc[i, '2405'])) * 100 for i in c3_test: test_merge.loc[i, '2405'] = (test_merge.loc[i, '2403']) / ((test_merge.loc[i, '2404']/100) * (test_merge.loc[i, '2404']/100)) test_merge.loc[test_merge['2403']==0, '2403'] = -999 test_merge.loc[test_merge['2404']==0, '2404'] = -999 test_merge.loc[test_merge['2405']==0, '2405'] = -999 t_train = DataFrame(train_merge[e_c_cols+c_not_count+['table_count','result_len','result_avg_len','sugar_high_related','sex']]) t_train.persist('wzm_trainset3_3_b2') t_test = DataFrame(test_merge[e_c_cols+c_not_count+['table_count','result_len','result_avg_len','sugar_high_related','sex']]) t_test.persist('wzm_testset3_3_b2') ''' exclude_cols = ['vid', 'sys', 'dia', 'tl', 'hdl', 'ldl'] e_c_cols = exclude_cols + cate_cols + cate_cols2 t_train = DataFrame(train_merge[e_c_cols+['sugar_high_related']]) t_train.persist('wzm_trainset3_3_b2') t_test = DataFrame(test_merge[e_c_cols+['sugar_high_related']]) t_test.persist('wzm_testset3_3_b2')
'jz_xgb_pred_val_1').to_df().to_pandas().loc[:, ['vid', 'log_tl', 'result']] val_2 = odps.get_table( 'jz_xgb_pred_val_2').to_df().to_pandas().loc[:, ['vid', 'log_tl', 'result']] val_3 = odps.get_table( 'jz_xgb_pred_val_3').to_df().to_pandas().loc[:, ['vid', 'log_tl', 'result']] val_4 = odps.get_table( 'jz_xgb_pred_val_4').to_df().to_pandas().loc[:, ['vid', 'log_tl', 'result']] val_5 = odps.get_table( 'jz_xgb_pred_val_5').to_df().to_pandas().loc[:, ['vid', 'log_tl', 'result']] xgb_result = val_1.loc[:, ['vid']] xgb_result['tl'] = np.exp( (val_1['result'] + val_2['result'] + val_3['result'] + val_4['result'] + val_5['result']) / 5) test_odps = DataFrame(xgb_result) test_odps.persist('tl_xgb_result') ''' val = pd.concat([val_1, val_2, val_3, val_4, val_5]) print('fold 1: ', eval_metric(np.exp(val_1['result']), np.exp(val_1['log_tl']))) print('fold 2: ', eval_metric(np.exp(val_2['result']), np.exp(val_2['log_tl']))) print('fold 3: ', eval_metric(np.exp(val_3['result']), np.exp(val_3['log_tl']))) print('fold 4: ', eval_metric(np.exp(val_4['result']), np.exp(val_4['log_tl']))) print('fold 5: ', eval_metric(np.exp(val_5['result']), np.exp(val_5['log_tl']))) print('total loss: ', eval_metric(np.exp(val['result']), np.exp(val['log_tl']))) '''
print(gbdt_features) sfm_rf = SelectFromModel(rf, threshold=0.001) sfm_rf.fit_transform(x_train, label) rf_features = set(x_train.columns[sfm_rf.get_support()]) print('*************************************') print(rf_features) print(gbdt_features & rf_features) sfm_l2 = SelectFromModel(l2, threshold=0.5) sfm_l2.fit_transform(x_train, label) l2_features = set(x_train.columns[sfm_l2.get_support()]) print('*************************************') print(l2_features) final_features = list(gbdt_features | rf_features | l2_features) # choose top k features #final_features = list((gbdt_features & rf_features) | l2_features) print('gbdt model has {} features'.format(len(gbdt_features))) print('rf model has {} features'.format(len(rf_features))) print('l2 model has {} features'.format(len(l2_features))) print('final has {} features'.format(len(final_features))) print('*************************************') print(final_features) print('*************************************') final_features.extend(['vid', 'tl']) train_final = DataFrame(train.loc[:, final_features]) train_final.persist('combine_tl_train_6_2') test_final = DataFrame(test.loc[:, final_features]) test_final.persist('combine_tl_test_6_2')
return eval_metric(scores, np.exp(value4preds)) # b-board # 739 601 887 709 1221 # 'dia': 0.018069628693683809 if __name__ == "__main__": train = odps.get_table( 'dia_juz_train_6_6_snp_onehot_22').to_df().to_pandas() test = odps.get_table('dia_juz_test_6_6_snp_onehot_22').to_df().to_pandas() print(train.shape) print(test.shape) predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl'] use_features = [ t for t in train.columns if t != 'vid' and t not in predict_features ] test_data = test.loc[:, use_features] submission = test.loc[:, ['vid', 'dia']] base_line_score = np.zeros(5) start = time.time() for i, j in enumerate(predict_features): if j in ['dia']: base_line_score[i] = gbdt_model(train, j, use_features, test_data, submission) print(dict(zip(predict_features, base_line_score))) print('CV训练用时{}秒'.format(time.time() - start)) print('scores:', np.mean(base_line_score)) sub_final = DataFrame(submission) sub_final.persist('dia_jz_5_fold_6_6_submit_22')
# Build jz_-prefixed copies of the plain features for the xgb stage.
# NOTE(review): reload(sys)/setdefaultencoding marks this as a Python 2 script.
import numpy as np
import sys
reload(sys)
sys.setdefaultencoding('utf8')

target = 'tl'

# Load the train/test tables produced by the snp one-hot step.
train = odps.get_table(
    '{}_juz_train_6_6_snp_onehot_22'.format(target)).to_df().to_pandas()
test = odps.get_table(
    '{}_juz_test_6_6_snp_onehot_22'.format(target)).to_df().to_pandas()
print(train.shape, test.shape)

# Log-transform the regression target on both splits.
train['log_{}'.format(target)] = np.log(train[target])
test['log_{}'.format(target)] = np.log(test[target])

drop_cols = ['sys', 'dia', 'tl', 'hdl', 'ldl']
for col in train.columns:
    # Skip the id column, snp one-hots, log targets and the raw targets.
    if col == 'vid' or 'snp' in col or 'log' in col or col in drop_cols:
        continue
    # Duplicate the feature under a jz_ prefix, then schedule the original for dropping.
    train['jz_{}'.format(col)] = train[col]
    test['jz_{}'.format(col)] = test[col]
    drop_cols.append(col)

train.drop(drop_cols, axis=1, inplace=True)
test.drop(drop_cols, axis=1, inplace=True)
print(train.shape, test.shape)

DataFrame(train).persist('juz_train_6_7_xgb')
DataFrame(test).persist('juz_test_6_7_xgb')
# Split the xgb training table into 5 cross-validation folds and persist
# each fold back to ODPS under a fold-numbered table name.
from odps import ODPS
import pandas as pd
from odps.df import DataFrame
from sklearn.model_selection import KFold

full_train = odps.get_table('juz_train_6_7_xgb').to_df().to_pandas()

splitter = KFold(n_splits=5, shuffle=True, random_state=1024)
for fold_no, (fit_idx, holdout_idx) in enumerate(splitter.split(full_train), start=1):
    print('第{}次拆分...'.format(fold_no))
    fold_fit = full_train.iloc[fit_idx]
    fold_holdout = full_train.iloc[holdout_idx]
    print(fold_fit.shape, fold_holdout.shape)
    DataFrame(fold_fit).persist('tl_xgb_train_{}'.format(fold_no))
    DataFrame(fold_holdout).persist('tl_xgb_test_{}'.format(fold_no))
class Test(TestBase):
    """Integration tests for the mixed (ODPS + pandas) DataFrame engine.

    Requires a live ODPS/MaxCompute connection (``self.odps`` from TestBase).
    Defect fixed: the ``def`` header of the tunnel-only test was missing in
    the source — its body (a local engine subclass plus assertions using
    ``self``) dangled without an enclosing method; it is restored below as
    ``testTunnelCase``.
    """

    def setup(self):
        import pandas as pd

        odps_data = [
            ['name1', 1],
            ['name2', 2],
            ['name1', 3],
        ]
        pd_data = [['name1', 5], ['name2', 6]]
        names = ['name', 'id']
        types = ['string', 'bigint']
        table = tn('pyodps_df_mixed_%d' % os.getpid())
        if self.odps.exist_table(table):
            self.t = self.odps.get_table(table)
        else:
            # create and populate the fixture table only on first use
            self.t = self.odps.create_table(
                table, Schema.from_lists(names, types), lifecycle=1)
            with self.t.open_writer() as w:
                w.write([self.t.new_record(r) for r in odps_data])
        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))
        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)

    def teardown(self):
        self.engine._selecter.force_odps = False

    def testGroupReduction(self):
        expr = self.odps_df.select(
            self.odps_df, id2=self.odps_df.id.map(lambda x: x + 1))
        expr = expr.groupby('name').id2.sum()
        expected = [['name1', 6], ['name2', 3]]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(sorted([[r[1]] for r in expected]), sorted(result))

    def assertPandasEqual(self, df1, df2):
        """Assert two pandas frames are equal, comparing via ODPS column types."""
        from odps.compat import six
        from odps import types as o_types
        from pandas.util.testing import assert_frame_equal

        # compare column types
        def get_odps_type(p_type):
            for data_type, builtin_type in six.iteritems(
                    o_types._odps_primitive_to_builtin_types):
                if issubclass(p_type.type, builtin_type):
                    return data_type

        types1 = [get_odps_type(dt) for dt in df1.dtypes]
        types2 = [get_odps_type(dt) for dt in df2.dtypes]
        self.assertSequenceEqual(types1, types2)
        assert_frame_equal(df1, df2, check_dtype=False)

    def testJoin(self):
        expr = self.odps_df.join(self.pd_df, 'name').sort('id_x')
        result = self.engine.execute(expr).values
        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))

    def testUnion(self):
        expr = self.odps_df.union(self.pd_df).sort(['id', 'name'])
        result = self.engine.execute(expr).values
        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df.union(self.pd_df).sort(['id', 'name'])).values
        self.assertTrue(result.equals(expected))

        # persist a union into a partitioned table and query it back
        schema = Schema.from_lists(
            [c.name for c in self.t.schema.columns if c.name != 'name'],
            [c.type for c in self.t.schema.columns if c.name != 'name'],
            ['name'], ['string'])
        t = self.odps.create_table(
            'tmp_pyodps_%s' % str(uuid.uuid4()).replace('-', '_'), schema)
        try:
            expr = self.odps_df.union(self.pd_df)
            expr.persist(t.name, create_table=False, partitions=['name'])
            self.assertEqual(self.engine.execute(DataFrame(t).count()), 5)
            self.engine._selecter.force_odps = False
            df = DataFrame(t)
            self.assertGreaterEqual(
                len(self.engine.execute(df.filter(df.name > 'a', df.name < 'b'))), 0)
        finally:
            t.drop()

    def testIsIn(self):
        expr = self.odps_df['name'].isin(self.pd_df['name']).rename('isin')
        result = self.engine.execute(expr).values
        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df['name'].isin(self.pd_df['name']).rename('isin')).values
        self.assertTrue(result.equals(expected))

        expr = (self.odps_df.id + 2).isin(self.pd_df['id']).rename('isin')
        res = self.engine.execute(expr)
        result = self._get_result(res)
        expected = [[False], [False], [True]]
        self.assertEqual(result, expected)

    def testMixed(self):
        expr = self.odps_df.union(
            self.odps_df.join(self.pd_df, 'name')[
                lambda x: x.name, lambda x: x.id_x.rename('id')]).sort(['name', 'id'])
        expr = expr[expr['name'].isin(self.pd_df['name'])]
        expr = expr[expr, func.rand(rtype='float').rename('rand')]
        result = self.engine.execute(expr).values[['name', 'id']]
        df = DataFrame(self.odps_df.to_pandas())
        test_expr = df.union(
            df.join(self.pd_df, 'name')[
                lambda x: x.name, lambda x: x.id_x.rename('id')]).sort(['name', 'id'])
        test_expr = test_expr[test_expr['name'].isin(self.pd_df['name'])]
        expected = self.pd_engine.execute(test_expr).values
        self.assertTrue(result.equals(expected))

    def testPandasPersist(self):
        import pandas as pd, numpy as np

        # persist into an explicit partition with an explicit odps object
        tmp_table_name = tn('pyodps_test_mixed_persist')
        self.odps.delete_table(tmp_table_name, if_exists=True)
        t = self.odps.create_table(
            tmp_table_name, ('a bigint, b bigint, c bigint', 'ds string'))
        t.create_partition('ds=today')
        try:
            pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
            df = DataFrame(pd_df).persist(
                tmp_table_name, partition='ds=today', odps=self.odps)
            self.assertPandasEqual(df[list('abc')].to_pandas(), pd_df)
        finally:
            self.odps.delete_table(tmp_table_name)

        # persist relying on the global odps account
        self.odps.to_global()
        tmp_table_name = tn('pyodps_test_mixed_persist2')
        self.odps.delete_table(tmp_table_name, if_exists=True)
        try:
            pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
            df = DataFrame(pd_df).persist(tmp_table_name)
            self.assertPandasEqual(df.to_pandas(), pd_df)
        finally:
            self.odps.delete_table(tmp_table_name)

    def testExecuteCacheTable(self):
        df = self.odps_df[self.odps_df.name == 'name1']
        result = df.execute().values
        self.assertEqual(len(result), 2)
        self.assertTrue(context.is_cached(df))

        # a cached expression should compile to a single source-collection node
        dag = self.engine.compile(df)
        calls = dag.topological_sort()
        self.assertEqual(len(calls), 1)
        self.assertTrue(is_source_collection(calls[0].expr))

        df2 = df[:5]
        result = df2.execute()
        self.assertEqual(len(result), 2)

    def testHandleCache(self):
        df = self.pd_df['name', self.pd_df.id + 1]
        df.execute()
        self.assertTrue(context.is_cached(df))

        df2 = df[df.id < 10]
        dag = self.engine.compile(df2)
        self.assertEqual(len(dag.nodes()), 1)
        self.assertTrue(is_source_collection(dag.nodes()[0].expr.input))

        df3 = self.pd_df[self.pd_df.id < 10].count()
        i = df3.execute()
        self.assertTrue(context.is_cached(df3))
        df4 = df3 + 1
        dag = self.engine.compile(df4)
        self.assertEqual(len(dag.nodes()), 1)
        self.assertIsNotNone(dag.nodes()[0].expr._fields[0].lhs.value)
        self.assertEqual(df4.execute(), i + 1)

    def testCacheTable(self):
        self.engine._selecter.force_odps = True
        df = self.odps_df.join(self.pd_df, 'name').cache()
        df2 = df.sort('id_x')
        dag = self.engine.compile(df2)
        self.assertEqual(len(dag.nodes()), 3)
        result = self.engine.execute(df2).values
        df3 = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df3.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))
        self.assertEqual(len(self.engine._generated_table_names), 2)

        table = context.get_cached(df)
        self.assertEqual(len(self.engine.execute(df)), len(expected))
        self.assertIs(context.get_cached(df), table)
        if not isinstance(table, SeahawksTable):
            self.assertEqual(context.get_cached(df).lifecycle, 1)

        df4 = df[df.id_x < 3].count()
        result = self.engine.execute(df4)
        self.assertEqual(result, 2)
        self.assertEqual(context.get_cached(df4), 2)

    def testUseCache(self):
        self.engine._selecter.force_odps = True
        df_cache = self.odps_df[self.odps_df['name'] == 'name1'].cache()
        df = df_cache[df_cache.id * 2, df_cache.exclude('id')]
        self.assertEqual(len(self.engine.execute(df, head=10)), 2)

        # dropping the cache table behind the engine's back must be recovered from
        context.get_cached(df_cache).drop()
        self.assertEqual(
            len(self.engine.execute(df_cache['name', df_cache.id * 2], head=10)), 2)
        self.assertTrue(context.is_cached(df_cache))
        self.assertTrue(self.odps.exist_table(context.get_cached(df_cache).name))

    def testHeadAndTail(self):
        res = self.odps_df.head(2)
        self.assertEqual(len(res), 2)

        df = self.odps_df[self.odps_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertTrue(context.is_cached(df))

        res = self.odps_df.tail(2)
        self.assertEqual(len(res), 2)
        self.assertTrue(all(it > 1 for it in res.values['id']))

        self.assertEqual(len(self.odps_df.name.head(2)), 2)
        self.assertEqual(len(self.odps_df.name.tail(2)), 2)

        res = self.pd_df.head(1)
        self.assertEqual(len(res), 1)

        df = self.pd_df[self.pd_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertTrue(context.is_cached(df))

        res = self.pd_df.tail(1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res.values['id'][0], 6)

        self.assertEqual(len(self.pd_df.name.head(1)), 1)
        self.assertEqual(len(self.pd_df.name.tail(1)), 1)

    # NOTE(review): the original method header for the following tunnel test was
    # lost when this file was mangled; the name below is reconstructed — confirm
    # against upstream before relying on test discovery by name.
    def testTunnelCase(self):
        class TunnelOnlyODPSEngine(ODPSSQLEngine):
            def _do_execute(self, *args, **kwargs):
                # force every execution down the tunnel path
                kwargs['_force_tunnel'] = True
                return super(TunnelOnlyODPSEngine, self)._do_execute(*args, **kwargs)

        engine = MixedEngine(self.odps)
        engine._odpssql_engine = TunnelOnlyODPSEngine(self.odps)
        res = engine.execute(self.odps_df['id'], head=3)
        self.assertIsNotNone(res)
        self.assertEqual(sum(res.values['id']), 6)

        table_name = tn('pyodps_df_mixed2')
        self.odps.delete_table(table_name, if_exists=True)
        table = next(self.odps_df.data_source())
        table2 = self.odps.create_table(table_name, table.schema)
        try:
            res = DataFrame(table2).head(10)
            self.assertEqual(len(res), 0)
        finally:
            table2.drop()

    def testMapReduceWithResource(self):
        pd_df2 = self.odps_df.to_pandas(wrap=True)

        @output(['name', 'id'], ['string', 'int'])
        def reducer(resources):
            d = dict()
            for r in resources[0]:
                if r.name in d:
                    d[r.name] += r.id
                else:
                    d[r.name] = r.id

            def inner(keys):
                def h(row, done):
                    if row.name in d:
                        d[row.name] += row.id
                    else:
                        d[row.name] = row.id
                    if done:
                        yield row.name, d[row.name]
                return h
            return inner

        expr = pd_df2.map_reduce(reducer=reducer,
                                 reducer_resources=[self.pd_df], group='name')
        result = expr.execute()
        self.assertEqual(result.values['id'].sum(), 17)

        odps_df2 = self.pd_df.persist(tn('pyodps_df_mixed2'), odps=self.odps)
        try:
            expr = self.odps_df.map_reduce(reducer=reducer,
                                           reducer_resources=[odps_df2], group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)

            expr = self.odps_df.map_reduce(reducer=reducer,
                                           reducer_resources=[self.pd_df], group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)

            expr = pd_df2.map_reduce(reducer=reducer,
                                     reducer_resources=[odps_df2], group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)
        finally:
            next(odps_df2.data_source()).drop()

    def testBloomFilter(self):
        import numpy as np

        data2 = [['name1'], ['name3']]
        table_name = tn('pyodps_test_mixed_engine_bf_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name, schema=Schema.from_lists(['name'], ['string']))
        expr2 = DataFrame(table2)
        self.odps.write_table(table2, 0, data2)
        try:
            expr = self.odps_df.bloom_filter('name', expr2[:1].name, capacity=10)
            res = self.engine.execute(expr)
            self.assertTrue(np.all(res['name'] != 'name2'))
        finally:
            table2.drop()

    def testCachePersist(self):
        expr = self.odps_df

        data2 = [['name1', 3.2], ['name3', 2.4]]
        table_name = tn('pyodps_test_mixed_engine_cp_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(['name', 'fid'], ['string', 'double']))
        expr2 = DataFrame(table2)
        self.odps.write_table(table2, 0, data2)

        @output(expr.schema.names, expr.schema.types)
        def h(row):
            yield row

        l = expr.filter(expr.id > 0).apply(h, axis=1).cache()
        r = expr2.filter(expr2.fid > 0)
        joined = l.join(r, on=['name', r.fid < 4])['id', 'fid'].cache()

        output_table = tn('pyodps_test_mixed_engine_cp_output_table')
        self.odps.delete_table(output_table, if_exists=True)
        schema = Schema.from_lists(['id', 'fid'], ['bigint', 'double'],
                                   ['ds'], ['string'])
        output_t = self.odps.create_table(output_table, schema, if_not_exists=True)
        t = joined.persist(output_table, partition='ds=today', create_partition=True)
        self.assertEqual(len(t.execute()), 2)
        # test seahawks fallback
        self.assertEqual(t.input.count().execute(), 2)
        output_t.drop()

    def testBigintPartitionedCache(self):
        table = tn('pyodps_test_bigint_partitioned_cache')
        self.odps.delete_table(table, if_exists=True)
        expr = self.odps_df.persist(table, partitions=['id'])

        @output(['id', 'name'], ['int', 'string'])
        def handle(row):
            return row.id + 1, row.name

        expr = expr['tt' + expr.name, expr.id].cache()
        new_expr = expr.map_reduce(mapper=handle)
        res = self.engine.execute(new_expr)
        self.assertEqual(len(res), 3)

    def testAsync(self):
        expr = self.odps_df[self.odps_df.name == 'name1']
        future = self.engine.execute(expr, async_=True)
        self.assertFalse(future.done())
        res = future.result()
        self.assertEqual(len(res), 2)

    def testBatch(self):
        odps_expr = self.odps_df[self.odps_df.id < 4].cache()
        expr = odps_expr.join(self.pd_df, 'name').sort('id_x')
        dag = self.engine.compile(expr)
        self.assertEqual(len(dag.nodes()), 3)
        f = self.engine.execute(expr, async_=True, n_parallel=2)
        result = f.result().values
        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))

    def testBatchStop(self):
        self.engine._selecter.force_odps = True
        expr1 = self.odps_df[self.odps_df.id < 3].cache()
        expr2 = self.odps_df[self.odps_df.id > 3].cache()
        expr3 = expr1.union(expr2)
        self.engine.execute([expr1, expr2, expr3], n_parallel=2, async_=True)
        time.sleep(2)
        instance_ids = self.engine._odpssql_engine._instances
        self.assertEqual(len(instance_ids), 2)
        self.engine.stop()
        instances = [self.odps.get_instance(i) for i in instance_ids]
        [i.wait_for_completion() for i in instances]
        self.assertEqual(
            list(instances[0].get_task_statuses().values())[0].status,
            Instance.Task.TaskStatus.CANCELLED)
        self.assertEqual(
            list(instances[1].get_task_statuses().values())[0].status,
            Instance.Task.TaskStatus.CANCELLED)

    def testFailure(self):
        from odps.df.backends.errors import DagDependencyError

        expr1 = self.odps_df[self.odps_df.id / 0 < 0].cache()
        expr2 = expr1.count()
        fs = self.engine.execute(expr2, async_=True)
        self.assertRaises(DagDependencyError, fs.result)

    def testAppendIDCache(self):
        options.ml.dry_run = False

        @output(['id1'] + self.odps_df.schema.names,
                ['int'] + self.odps_df.schema.types)
        def h(row):
            yield row

        expr1 = self.odps_df.append_id(id_col='id1').apply(h, axis=1)
        expr2 = self.odps_df.append_id(id_col='id2')
        expr3 = expr1.join(expr2, on='id')['id1', 'id2']
        self.assertEqual(len(expr3.execute()), 3)

    def testAppendId(self):
        options.ml.dry_run = False

        expr = self.odps_df['name', ].distinct()
        expr = expr.append_id(id_col='id2')
        expr = expr.join(self.odps_df, on=['name'])
        tablename = tn('pyodps_test_append_id_persist')
        self.odps.delete_table(tablename, if_exists=True)
        expr.persist(tablename, partitions=['name'], lifecycle=1)

    def testHorzConcat(self):
        options.ml.dry_run = False

        table_name = tn('test_horz_concat_table2_xxx_yyy')
        self.odps.delete_table(table_name, if_exists=True)
        result_table_name = tn('test_horz_concat_result')
        self.odps.delete_table(result_table_name, if_exists=True)
        self.odps_df[self.odps_df.name,
                     (self.odps_df.id * 2).rename('ren_id')].persist(table_name)
        df2 = self.odps.get_table(table_name).to_df()
        df2 = df2[:3]
        expr = self.odps_df.concat(df2.ren_id, axis=1)
        expr.persist(result_table_name, lifecycle=1)

    def testAsTypeMapReduce(self):
        expr = self.odps_df[self.odps_df.exclude('id'),
                            self.odps_df.id.astype('float')]
        expr = expr.filter(expr.id < 10)['id', 'name']

        @output(['id', 'name'], ['float', 'string'])
        def h(group):
            def inn(row, done):
                yield row
            return inn

        expr = expr.map_reduce(reducer=h)
        expr.execute()

        expr = self.odps_df[self.odps_df.exclude('id'),
                            self.odps_df.id.astype('float')]
        expr = expr.filter(expr.id < 10).distinct('id', 'name')

        @output(['id', 'name'], ['float', 'string'])
        def h(group):
            def inn(row, done):
                yield row
            return inn

        expr = expr.map_reduce(reducer=h)
        expr.execute()
class Test(TestBase):
    """Integration tests for the mixed execution engine: expressions that
    combine an ODPS-backed DataFrame (``self.odps_df``, 3 rows) with a
    pandas-backed DataFrame (``self.pd_df``, 2 rows) sharing the schema
    (name: string, id: bigint). Requires a live ODPS connection."""

    def setup(self):
        """Create the ODPS fixture table, write 3 rows, and build both
        DataFrames plus the mixed and pandas engines."""
        import pandas as pd

        odps_data = [["name1", 1], ["name2", 2], ["name1", 3]]
        pd_data = [["name1", 5], ["name2", 6]]
        names = ["name", "id"]
        types = ["string", "bigint"]
        table = tn("pyodps_df_mixed")
        self.odps.delete_table(table, if_exists=True)
        self.t = self.odps.create_table(table, Schema.from_lists(names, types))
        with self.t.open_writer() as w:
            w.write([self.t.new_record(r) for r in odps_data])
        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))
        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)

    def teardown(self):
        # drop the fixture table created in setup
        self.t.drop()

    def testGroupReduction(self):
        """A mapped column can be grouped and summed; only the aggregated
        values are compared (order-insensitive)."""
        expr = self.odps_df.select(self.odps_df,
                                   id2=self.odps_df.id.map(lambda x: x + 1))
        expr = expr.groupby("name").id2.sum()
        expected = [["name1", 6], ["name2", 3]]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(sorted([[r[1]] for r in expected]), sorted(result))

    def assertPandasEqual(self, df1, df2):
        """Assert two pandas DataFrames are equal, first mapping their dtypes
        to ODPS primitive types so that equivalent storage types compare as
        equal, then comparing values without dtype strictness."""
        from odps.compat import six
        from odps import types as o_types
        from pandas.util.testing import assert_frame_equal

        # compare column types via the ODPS primitive-type mapping
        def get_odps_type(p_type):
            for data_type, builtin_type in six.iteritems(
                    o_types._odps_primitive_to_builtin_types):
                if issubclass(p_type.type, builtin_type):
                    return data_type

        types1 = [get_odps_type(dt) for dt in df1.dtypes]
        types2 = [get_odps_type(dt) for dt in df2.dtypes]
        self.assertSequenceEqual(types1, types2)
        assert_frame_equal(df1, df2, check_dtype=False)

    def testJoin(self):
        """ODPS-pandas join matches the pure pandas-engine result."""
        expr = self.odps_df.join(self.pd_df, "name").sort("id_x")
        result = self.engine.execute(expr).values
        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df.join(self.pd_df, "name").sort("id_x")).values
        self.assertTrue(result.equals(expected))

    def testUnion(self):
        """ODPS-pandas union matches the pure pandas-engine result."""
        expr = self.odps_df.union(self.pd_df).sort(["id", "name"])
        result = self.engine.execute(expr).values
        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df.union(self.pd_df).sort(["id", "name"])).values
        self.assertTrue(result.equals(expected))

    def testIsIn(self):
        """isin against a pandas-backed column matches the pandas engine."""
        expr = self.odps_df["name"].isin(self.pd_df["name"]).rename("isin")
        result = self.engine.execute(expr).values
        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df["name"].isin(self.pd_df["name"]).rename("isin")).values
        self.assertTrue(result.equals(expected))

    def testMixed(self):
        """Union + join + isin combined in one expression matches the pure
        pandas-engine evaluation of the same pipeline."""
        expr = self.odps_df.union(
            self.odps_df.join(self.pd_df, "name")[
                lambda x: x.name, lambda x: x.id_x.rename("id")]
        ).sort(["name", "id"])
        expr = expr[expr["name"].isin(self.pd_df["name"])]
        result = self.engine.execute(expr).values
        df = DataFrame(self.odps_df.to_pandas())
        test_expr = df.union(
            df.join(self.pd_df, "name")[
                lambda x: x.name, lambda x: x.id_x.rename("id")]
        ).sort(["name", "id"])
        test_expr = test_expr[test_expr["name"].isin(self.pd_df["name"])]
        expected = self.pd_engine.execute(test_expr).values
        self.assertTrue(result.equals(expected))

    def testPandasPersist(self):
        """A pandas-backed DataFrame persisted to ODPS round-trips: the
        persisted table read back equals the original pandas frame."""
        import pandas as pd, numpy as np

        self.odps.to_global()
        tmp_table_name = tn("pyodps_test_mixed_persist")
        self.odps.delete_table(tmp_table_name, if_exists=True)
        pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list("abc"))
        df = DataFrame(pd_df).persist(tmp_table_name)
        self.assertPandasEqual(df.to_pandas(), pd_df)
        self.odps.delete_table(tmp_table_name)

    def testExecuteCacheTable(self):
        """Executing a filter caches its data; recompiling then reuses the
        cached source, and slicing the cached expr executes without rerun."""
        df = self.odps_df[self.odps_df.name == "name1"]
        result = df.execute().values
        self.assertEqual(len(result), 2)
        self.assertIsNotNone(df._cache_data)
        _, new_df, cbs = self.engine._compile(df)
        try:
            # recompilation should substitute the cached table as the source
            self.assertIsNotNone(new_df._source_data)
        finally:
            [cb() for cb in cbs]  # run compile-time cleanup callbacks
        df2 = df[:5]
        result = df2.execute()
        self.assertEqual(len(result), 2)

    def testHandleCache(self):
        """Pandas-backed expressions also cache after execute, and downstream
        filters compile against the cached input."""
        df = self.pd_df["name", self.pd_df.id + 1]
        df.execute()
        self.assertIsNotNone(df._cache_data)
        df2 = df[df.id < 10]
        _, new_df2, cbs = self.engine._compile(df2)
        try:
            self.assertIsNotNone(new_df2.input._source_data)
        finally:
            [cb() for cb in cbs]  # run compile-time cleanup callbacks

    def testCacheTable(self):
        """A cached join materializes once: later executions reuse the same
        cache object, and aggregations on it execute against the cache."""
        df = self.odps_df.join(self.pd_df, "name").cache()
        df2 = df.sort("id_x")
        dag = self.engine._compile_dag(df2)
        self.assertEqual(len(dag.nodes()), 3)
        result = self.engine.execute(df2).values
        df3 = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df3.join(self.pd_df, "name").sort("id_x")).values
        self.assertTrue(result.equals(expected))
        # two intermediate tables should have been generated by the engine
        self.assertEqual(len(self.engine._generated_table_names), 2)
        table = df._cache_data
        self.assertEqual(len(df.execute()), len(expected))
        # re-executing must reuse the identical cache object, not rebuild it
        self.assertIs(df._cache_data, table)
        df4 = df[df.id_x < 3].count()
        result = self.engine.execute(df4)
        self.assertEqual(result, 2)
        self.assertEqual(df4._cache_data, 2)

    def testUseCache(self):
        """Dropping the backing cache table makes further executions fail
        with ODPSError instead of silently recomputing."""
        df = self.odps_df[self.odps_df["name"] == "name1"]
        self.assertEqual(len(df.head(10)), 2)
        df._cache_data.drop()
        self.assertRaises(
            ODPSError, lambda: self.engine.execute(df["name", "id"]))

        def plot(**_):
            pass

        self.assertRaises(ODPSError, lambda: df.plot(x="id", plot_func=plot))

    def testPivot(self):
        """pivot() with single/multiple value columns, projection of pivoted
        columns, arithmetic on them, and joining pivoted output back."""
        data = [["name1", 1, 1.0, True], ["name1", 2, 2.0, True],
                ["name2", 1, 3.0, False], ["name2", 3, 4.0, False]]
        table_name = tn("pyodps_test_mixed_engine_pivot")
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(["name", "id", "fid", "ismale"],
                                     ["string", "bigint", "double", "boolean"]),
        )
        expr = DataFrame(table)
        try:
            self.odps.write_table(table, 0, data)

            # single value column
            expr1 = expr.pivot(rows="id", columns="name", values="fid").distinct()
            res = self.engine.execute(expr1)
            result = self._get_result(res)
            expected = [[1, 1.0, 3.0], [2, 2.0, None], [3, None, 4.0]]
            self.assertEqual(sorted(result), sorted(expected))

            # multiple value columns
            expr2 = expr.pivot(rows="id", columns="name",
                               values=["fid", "ismale"])
            res = self.engine.execute(expr2)
            result = self._get_result(res)
            expected = [[1, 1.0, 3.0, True, False],
                        [2, 2.0, None, True, None],
                        [3, None, 4.0, None, False]]
            self.assertEqual(sorted(result), sorted(expected))

            # projecting a non-existent pivoted column raises ValueError
            expr3 = expr.pivot(rows="id", columns="name", values="fid")["name3"]
            with self.assertRaises(ValueError) as cm:
                self.engine.execute(expr3)
            self.assertIn("name3", str(cm.exception))

            # projecting existing pivoted columns
            expr4 = expr.pivot(
                rows="id", columns="name", values="fid")["id", "name1"]
            res = self.engine.execute(expr4)
            result = self._get_result(res)
            expected = [[1, 1.0], [2, 2.0], [3, None]]
            self.assertEqual(sorted(result), sorted(expected))

            # arithmetic on a pivoted column
            expr5 = expr.pivot(rows="id", columns="name", values="fid")
            expr5 = expr5[expr5,
                          (expr5["name1"].astype("int") + 1).rename("new_name")]
            res = self.engine.execute(expr5)
            result = self._get_result(res)
            expected = [[1, 1.0, 3.0, 2.0], [2, 2.0, None, 3.0],
                        [3, None, 4.0, None]]
            self.assertEqual(sorted(result), sorted(expected))

            # joining the pivoted output back to the fixture table
            expr6 = expr.pivot(rows="id", columns="name", values="fid")
            expr6 = expr6.join(self.odps_df, on="id")[expr6, "name"]
            res = self.engine.execute(expr6)
            result = self._get_result(res)
            expected = [[1, 1.0, 3.0, "name1"], [2, 2.0, None, "name2"],
                        [3, None, 4.0, "name1"]]
            self.assertEqual(sorted(result), sorted(expected))
        finally:
            table.drop()

    def testPivotTable(self):
        """pivot_table() with default/multiple/custom aggregations,
        fill_value, and cumulative post-processing of pivoted columns."""
        data = [
            ["name1", 1, 1.0, True],
            ["name1", 1, 5.0, True],
            ["name1", 2, 2.0, True],
            ["name2", 1, 3.0, False],
            ["name2", 3, 4.0, False],
        ]
        table_name = tn("pyodps_test_mixed_engine_pivot_table")
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(["name", "id", "fid", "ismale"],
                                     ["string", "bigint", "double", "boolean"]),
        )
        expr = DataFrame(table)
        try:
            self.odps.write_table(table, 0, data)

            # default aggregation (mean)
            expr1 = expr.pivot_table(rows="name", values="fid")
            res = self.engine.execute(expr1)
            result = self._get_result(res)
            expected = [["name1", 8.0 / 3], ["name2", 3.5]]
            self.assertEqual(sorted(result), sorted(expected))

            # multiple aggregation functions
            expr2 = expr.pivot_table(rows="name", values="fid",
                                     aggfunc=["mean", "sum"])
            res = self.engine.execute(expr2)
            result = self._get_result(res)
            expected = [["name1", 8.0 / 3, 8.0], ["name2", 3.5, 7.0]]
            self.assertEqual(res.schema.names, ["name", "fid_mean", "fid_sum"])
            self.assertEqual(sorted(result), sorted(expected))

            # cumsum over a sorted pivoted column
            expr5 = expr.pivot_table(rows="id", values="fid", columns="name",
                                     aggfunc=["mean", "sum"])
            expr6 = expr5[
                "name1_fid_mean",
                expr5.groupby(Scalar(1)).sort(
                    "name1_fid_mean").name1_fid_mean.astype("float").cumsum(),
            ]
            # sort key treating None as 0 so rows with NULLs compare stably
            k = lambda x: list(0 if it is None else it for it in x)
            # TODO: fix this situation, act different compared to pandas
            expected = [[2, 2], [3, 5], [None, None]]
            res = self.engine.execute(expr6)
            result = self._get_result(res)
            self.assertEqual(sorted(result, key=k), sorted(expected, key=k))

            # fill_value replaces NULL cells
            expr3 = expr.pivot_table(rows="id", values="fid", columns="name",
                                     fill_value=0).distinct()
            res = self.engine.execute(expr3)
            result = self._get_result(res)
            expected = [[1, 3.0, 3.0], [2, 2.0, 0], [3, 0, 4.0]]
            self.assertEqual(res.schema.names,
                             ["id", "name1_fid_mean", "name2_fid_mean"])
            self.assertEqual(result, expected)

            # custom UDAF mixed with a builtin aggregation
            class Agg(object):
                def buffer(self):
                    return [0]

                def __call__(self, buffer, val):
                    buffer[0] += val

                def merge(self, buffer, pbuffer):
                    buffer[0] += pbuffer[0]

                def getvalue(self, buffer):
                    return buffer[0]

            aggfuncs = OrderedDict([("my_sum", Agg), ("mean", "mean")])
            expr4 = expr.pivot_table(rows="id", values="fid", columns="name",
                                     fill_value=0, aggfunc=aggfuncs)
            res = self.engine.execute(expr4)
            result = self._get_result(res)
            expected = [[1, 6.0, 3.0, 3.0, 3.0], [2, 2.0, 0, 2.0, 0],
                        [3, 0, 4.0, 0, 4.0]]
            self.assertEqual(
                res.schema.names,
                ["id", "name1_fid_my_sum", "name2_fid_my_sum",
                 "name1_fid_mean", "name2_fid_mean"]
            )
            self.assertEqual(result, expected)
        finally:
            table.drop()

    def testExtractKV(self):
        """extract_kv() expands two key=value string columns into one typed
        column per observed key, NULL where a row lacks the key."""
        data = [
            ["name1", "k1=1,k2=3,k5=10", "1=5,3=7,2=1"],
            ["name1", "", "3=1,4=2"],
            ["name1", "k1=7.1,k7=8.2", "1=1,5=6"],
            ["name2", "k2=1.2,k3=1.5", None],
            ["name2", "k9=1.1,k2=1", "4=2"],
        ]
        table_name = tn("pyodps_test_mixed_engine_extract_kv")
        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(["name", "kv", "kv2"],
                                     ["string", "string", "string"])
        )
        expr = DataFrame(table)
        try:
            self.odps.write_table(table, 0, data)
            expr1 = expr.extract_kv(columns=["kv", "kv2"], kv_delim="=")
            res = self.engine.execute(expr1)
            result = self._get_result(res)
            expected_cols = [
                "name",
                "kv_k1", "kv_k2", "kv_k3", "kv_k5", "kv_k7", "kv_k9",
                "kv2_1", "kv2_2", "kv2_3", "kv2_4", "kv2_5",
            ]
            expected = [
                ["name1", 1.0, 3.0, None, 10.0, None, None,
                 5.0, 1.0, 7.0, None, None],
                ["name1", None, None, None, None, None, None,
                 None, None, 1.0, 2.0, None],
                ["name1", 7.1, None, None, None, 8.2, None,
                 1.0, None, None, None, 6.0],
                ["name2", None, 1.2, 1.5, None, None, None,
                 None, None, None, None, None],
                ["name2", None, 1.0, None, None, None, 1.1,
                 None, None, None, 2.0, None],
            ]
            self.assertListEqual([c.name for c in res.columns], expected_cols)
            self.assertEqual(result, expected)
        finally:
            table.drop()

    def testHeadAndTail(self):
        """head()/tail() on ODPS- and pandas-backed frames, including a
        tunnel-only engine that bypasses SQL, and head() on an empty table."""
        res = self.odps_df.head(2)
        self.assertEqual(len(res), 2)
        df = self.odps_df[self.odps_df["name"] == "name1"]
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertIsNotNone(df._cache_data)
        res = self.odps_df.tail(2)
        self.assertEqual(len(res), 2)
        # fixture ids are 1, 2, 3 → the last two are both > 1
        self.assertTrue(all(it > 1 for it in res.values["id"]))
        self.assertEqual(len(self.odps_df.name.head(2)), 2)
        self.assertEqual(len(self.odps_df.name.tail(2)), 2)
        res = self.pd_df.head(1)
        self.assertEqual(len(res), 1)
        df = self.pd_df[self.pd_df["name"] == "name1"]
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertIsNotNone(df._cache_data)
        res = self.pd_df.tail(1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res.values["id"][0], 6)
        self.assertEqual(len(self.pd_df.name.head(1)), 1)
        self.assertEqual(len(self.pd_df.name.tail(1)), 1)

        # engine that only handles tunnel-served cases (no SQL fallback)
        class TunnelOnlyODPSEngine(ODPSEngine):
            def execute(self, expr, **kw):
                expr = self._pre_process(expr)
                head = kw.get("head")
                return self._handle_cases(expr, head=head)

        engine = MixedEngine(self.odps)
        engine._odpssql_engine = TunnelOnlyODPSEngine(
            self.odps, global_optimize=False)
        res = engine.execute(self.odps_df["id"], head=3)
        self.assertIsNotNone(res)
        self.assertEqual(sum(res.values["id"]), 6)

        # head() on a freshly created (empty) table returns zero rows
        table_name = tn("pyodps_df_mixed2")
        self.odps.delete_table(table_name, if_exists=True)
        table = next(self.odps_df.data_source())
        table2 = self.odps.create_table(table_name, table.schema)
        try:
            res = DataFrame(table2).head(10)
            self.assertEqual(len(res), 0)
        finally:
            table2.drop()

    def testMapReduceWithResource(self):
        """map_reduce with a table resource works for every backend pairing
        (pandas/ODPS data × pandas/ODPS resource); each run sums to 17."""
        pd_df2 = self.odps_df.to_pandas(wrap=True)

        @output(["name", "id"], ["string", "int"])
        def reducer(resources):
            # seed the per-name totals from the resource table
            d = dict()
            for r in resources[0]:
                if r.name in d:
                    d[r.name] += r.id
                else:
                    d[r.name] = r.id

            def inner(keys):
                def h(row, done):
                    if row.name in d:
                        d[row.name] += row.id
                    else:
                        d[row.name] = row.id
                    if done:
                        yield row.name, d[row.name]
                return h
            return inner

        expr = pd_df2.map_reduce(reducer=reducer,
                                 reducer_resources=[self.pd_df], group="name")
        result = expr.execute()
        self.assertEqual(result.values["id"].sum(), 17)

        odps_df2 = self.pd_df.persist(tn("pyodps_df_mixed2"), odps=self.odps)
        try:
            expr = self.odps_df.map_reduce(
                reducer=reducer, reducer_resources=[odps_df2], group="name")
            result = expr.execute()
            self.assertEqual(result.values["id"].sum(), 17)

            expr = self.odps_df.map_reduce(
                reducer=reducer, reducer_resources=[self.pd_df], group="name")
            result = expr.execute()
            self.assertEqual(result.values["id"].sum(), 17)

            expr = pd_df2.map_reduce(
                reducer=reducer, reducer_resources=[odps_df2], group="name")
            result = expr.execute()
            self.assertEqual(result.values["id"].sum(), 17)
        finally:
            next(odps_df2.data_source()).drop()

    def testBloomFilter(self):
        """bloom_filter against a sliced second table prunes rows; 'name2'
        must not survive the filter."""
        import numpy as np

        data2 = [["name1"], ["name3"]]
        table_name = tn("pyodps_test_mixed_engine_bf_table2")
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name, schema=Schema.from_lists(["name"], ["string"]))
        expr2 = DataFrame(table2)
        self.odps.write_table(table2, 0, data2)
        try:
            expr = self.odps_df.bloom_filter("name", expr2[:1].name,
                                             capacity=10)
            res = self.engine.execute(expr)
            self.assertTrue(np.all(res["name"] != "name2"))
        finally:
            table2.drop()

    def testCachePersist(self):
        """Cached apply + join chain persists into a partitioned table; the
        fixture data yields 2 matching rows."""
        expr = self.odps_df
        data2 = [["name1", 3.2], ["name3", 2.4]]
        table_name = tn("pyodps_test_mixed_engine_cp_table2")
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(["name", "fid"], ["string", "double"])
        )
        expr2 = DataFrame(table2)
        self.odps.write_table(table2, 0, data2)

        @output(expr.schema.names, expr.schema.types)
        def h(row):
            yield row

        l = expr.filter(expr.id > 0).apply(h, axis=1).cache()
        r = expr2.filter(expr2.fid > 0)
        # join on name plus an inequality predicate on the right side
        joined = l.join(r, on=["name", r.fid < 4])["id", "fid"].cache()
        output_table = tn("pyodps_test_mixed_engine_cp_output_table")
        self.odps.delete_table(output_table, if_exists=True)
        schema = Schema.from_lists(["id", "fid"], ["bigint", "double"],
                                   ["ds"], ["string"])
        output_t = self.odps.create_table(output_table, schema,
                                          if_not_exists=True)
        t = joined.persist(output_table, partition="ds=today",
                           create_partition=True)
        self.assertEqual(len(t.execute()), 2)
        output_t.drop()

    def testBigintPartitionedCache(self):
        """map_reduce over a cached projection of a table partitioned by a
        bigint column returns all 3 fixture rows."""
        table = tn("pyodps_test_bigint_partitioned_cache")
        self.odps.delete_table(table, if_exists=True)
        expr = self.odps_df.persist(table, partitions=["id"])

        @output(["id", "name"], ["int", "string"])
        def handle(row):
            return row.id + 1, row.name

        expr = expr["tt" + expr.name, expr.id].cache()
        new_expr = expr.map_reduce(mapper=handle)
        res = self.engine.execute(new_expr)
        self.assertEqual(len(res), 3)