def testSimplePersist(self):
    self.create_iris(IRIS_TABLE)
    df = DataFrame(self.odps.get_table(IRIS_TABLE))
    df.append_id().persist(SIMPLE_PERSIST_TABLE, lifecycle=1, drop_table=True)
    self.assertTrue(self.odps.exist_table(SIMPLE_PERSIST_TABLE))
def testDynamicPartition(self):
    self.create_iris(IRIS_TABLE)
    df = DataFrame(self.odps.get_table(IRIS_TABLE))
    id_df = df.append_id()

    self.odps.delete_table(DYNAMIC_PART_TABLE, if_exists=True)
    id_df.persist(DYNAMIC_PART_TABLE, partitions='category', lifecycle=1, drop_table=True)

    self.assertTrue(self.odps.exist_table(DYNAMIC_PART_TABLE))
    t = self.odps.get_table(DYNAMIC_PART_TABLE)
    self.assertIn('category', [pt.name for pt in t.schema.partitions])
def testStaticPartition(self):
    self.create_iris(IRIS_TABLE)
    df = DataFrame(self.odps.get_table(IRIS_TABLE))
    id_df = df.append_id()

    src_schema = df_schema_to_odps_schema(id_df.schema)
    schema = Schema(columns=src_schema.simple_columns,
                    partitions=[Partition(name='ds', type=odps_types.string)])
    self.odps.delete_table(STATIC_PART_TABLE, if_exists=True)
    dest_table = self.odps.create_table(STATIC_PART_TABLE, schema, lifecycle=1)

    id_df.persist(STATIC_PART_TABLE, partition='ds=20170314', lifecycle=1)
    self.assertTrue(dest_table.exist_partition('ds=20170314'))
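# Note on the two persist variants exercised above (a hedged summary, not part
# of the original tests): ``partitions='category'`` derives dynamic partition
# values from the named column, while ``partition='ds=20170314'`` writes into a
# single, explicitly named static partition of a pre-created table. A minimal
# sketch, assuming ``df`` is a PyODPS DataFrame:
#
#   df.persist('dyn_table', partitions='category', lifecycle=1, drop_table=True)
#   df.persist('static_table', partition='ds=20170314', lifecycle=1)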
class Test(TestBase):
    def setup(self):
        import pandas as pd

        odps_data = [
            ['name1', 1],
            ['name2', 2],
            ['name1', 3],
        ]

        pd_data = [['name1', 5], ['name2', 6]]

        names = ['name', 'id']
        types = ['string', 'bigint']

        table = tn('pyodps_df_mixed_%d' % os.getpid())
        if self.odps.exist_table(table):
            self.t = self.odps.get_table(table)
        else:
            self.t = self.odps.create_table(table, Schema.from_lists(names, types), lifecycle=1)
            with self.t.open_writer() as w:
                w.write([self.t.new_record(r) for r in odps_data])

        self.odps_df = DataFrame(self.t)
        self.pd_df = DataFrame(pd.DataFrame(pd_data, columns=names))

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)

    def teardown(self):
        self.engine._selecter.force_odps = False

    def testGroupReduction(self):
        expr = self.odps_df.select(self.odps_df, id2=self.odps_df.id.map(lambda x: x + 1))
        expr = expr.groupby('name').id2.sum()

        expected = [['name1', 6], ['name2', 3]]
        res = self.engine.execute(expr)
        result = self._get_result(res)
        self.assertEqual(sorted([[r[1]] for r in expected]), sorted(result))

    def assertPandasEqual(self, df1, df2):
        from odps.compat import six
        from odps import types as o_types
        from pandas.util.testing import assert_frame_equal

        # compare column types
        def get_odps_type(p_type):
            for data_type, builtin_type in six.iteritems(o_types._odps_primitive_to_builtin_types):
                if issubclass(p_type.type, builtin_type):
                    return data_type

        types1 = [get_odps_type(dt) for dt in df1.dtypes]
        types2 = [get_odps_type(dt) for dt in df2.dtypes]
        self.assertSequenceEqual(types1, types2)

        assert_frame_equal(df1, df2, check_dtype=False)

    def testJoin(self):
        expr = self.odps_df.join(self.pd_df, 'name').sort('id_x')
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))

    def testUnion(self):
        expr = self.odps_df.union(self.pd_df).sort(['id', 'name'])
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df.union(self.pd_df).sort(['id', 'name'])).values
        self.assertTrue(result.equals(expected))

        schema = Schema.from_lists(
            [c.name for c in self.t.schema.columns if c.name != 'name'],
            [c.type for c in self.t.schema.columns if c.name != 'name'],
            ['name'], ['string'])
        t = self.odps.create_table('tmp_pyodps_%s' % str(uuid.uuid4()).replace('-', '_'), schema)
        try:
            expr = self.odps_df.union(self.pd_df)
            expr.persist(t.name, create_table=False, partitions=['name'])
            self.assertEqual(self.engine.execute(DataFrame(t).count()), 5)

            self.engine._selecter.force_odps = False
            df = DataFrame(t)
            self.assertGreaterEqual(
                len(self.engine.execute(df.filter(df.name > 'a', df.name < 'b'))), 0)
        finally:
            t.drop()

    def testIsIn(self):
        expr = self.odps_df['name'].isin(self.pd_df['name']).rename('isin')
        result = self.engine.execute(expr).values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(
            df['name'].isin(self.pd_df['name']).rename('isin')).values
        self.assertTrue(result.equals(expected))

        expr = (self.odps_df.id + 2).isin(self.pd_df['id']).rename('isin')
        res = self.engine.execute(expr)
        result = self._get_result(res)

        expected = [[False], [False], [True]]
        self.assertEqual(result, expected)

    def testMixed(self):
        expr = self.odps_df.union(
            self.odps_df.join(self.pd_df, 'name')[
                lambda x: x.name,
                lambda x: x.id_x.rename('id'),
            ]).sort(['name', 'id'])
        expr = expr[expr['name'].isin(self.pd_df['name'])]
        expr = expr[expr, func.rand(rtype='float').rename('rand')]
        result = self.engine.execute(expr).values[['name', 'id']]

        df = DataFrame(self.odps_df.to_pandas())
        test_expr = df.union(
            df.join(self.pd_df, 'name')[
                lambda x: x.name,
                lambda x: x.id_x.rename('id'),
            ]).sort(['name', 'id'])
        test_expr = test_expr[test_expr['name'].isin(self.pd_df['name'])]
        expected = self.pd_engine.execute(test_expr).values

        self.assertTrue(result.equals(expected))

    def testPandasPersist(self):
        import pandas as pd, numpy as np

        tmp_table_name = tn('pyodps_test_mixed_persist')
        self.odps.delete_table(tmp_table_name, if_exists=True)
        t = self.odps.create_table(tmp_table_name, ('a bigint, b bigint, c bigint', 'ds string'))
        t.create_partition('ds=today')
        try:
            pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
            df = DataFrame(pd_df).persist(tmp_table_name, partition='ds=today', odps=self.odps)
            self.assertPandasEqual(df[list('abc')].to_pandas(), pd_df)
        finally:
            self.odps.delete_table(tmp_table_name)

        self.odps.to_global()

        tmp_table_name = tn('pyodps_test_mixed_persist2')
        self.odps.delete_table(tmp_table_name, if_exists=True)
        try:
            pd_df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
            df = DataFrame(pd_df).persist(tmp_table_name)
            self.assertPandasEqual(df.to_pandas(), pd_df)
        finally:
            self.odps.delete_table(tmp_table_name)

    def testExecuteCacheTable(self):
        df = self.odps_df[self.odps_df.name == 'name1']
        result = df.execute().values
        self.assertEqual(len(result), 2)
        self.assertTrue(context.is_cached(df))

        dag = self.engine.compile(df)
        calls = dag.topological_sort()
        self.assertEqual(len(calls), 1)
        self.assertTrue(is_source_collection(calls[0].expr))

        df2 = df[:5]
        result = df2.execute()
        self.assertEqual(len(result), 2)

    def testHandleCache(self):
        df = self.pd_df['name', self.pd_df.id + 1]
        df.execute()
        self.assertTrue(context.is_cached(df))

        df2 = df[df.id < 10]
        dag = self.engine.compile(df2)
        self.assertEqual(len(dag.nodes()), 1)
        self.assertTrue(is_source_collection(dag.nodes()[0].expr.input))

        df3 = self.pd_df[self.pd_df.id < 10].count()
        i = df3.execute()
        self.assertTrue(context.is_cached(df3))

        df4 = df3 + 1
        dag = self.engine.compile(df4)
        self.assertEqual(len(dag.nodes()), 1)
        self.assertIsNotNone(dag.nodes()[0].expr._fields[0].lhs.value)
        self.assertEqual(df4.execute(), i + 1)

    def testCacheTable(self):
        self.engine._selecter.force_odps = True

        df = self.odps_df.join(self.pd_df, 'name').cache()
        df2 = df.sort('id_x')

        dag = self.engine.compile(df2)
        self.assertEqual(len(dag.nodes()), 3)

        result = self.engine.execute(df2).values

        df3 = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df3.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))
        self.assertEqual(len(self.engine._generated_table_names), 2)

        table = context.get_cached(df)
        self.assertEqual(len(self.engine.execute(df)), len(expected))
        self.assertIs(context.get_cached(df), table)
        if not isinstance(table, SeahawksTable):
            self.assertEqual(context.get_cached(df).lifecycle, 1)

        df4 = df[df.id_x < 3].count()
        result = self.engine.execute(df4)
        self.assertEqual(result, 2)
        self.assertEqual(context.get_cached(df4), 2)

    def testUseCache(self):
        self.engine._selecter.force_odps = True

        df_cache = self.odps_df[self.odps_df['name'] == 'name1'].cache()
        df = df_cache[df_cache.id * 2, df_cache.exclude('id')]
        self.assertEqual(len(self.engine.execute(df, head=10)), 2)

        context.get_cached(df_cache).drop()

        self.assertEqual(
            len(self.engine.execute(df_cache['name', df_cache.id * 2], head=10)), 2)
        self.assertTrue(context.is_cached(df_cache))
        self.assertTrue(self.odps.exist_table(context.get_cached(df_cache).name))

    def testHeadAndTail(self):
        res = self.odps_df.head(2)
        self.assertEqual(len(res), 2)

        df = self.odps_df[self.odps_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertTrue(context.is_cached(df))

        res = self.odps_df.tail(2)
        self.assertEqual(len(res), 2)
        self.assertTrue(all(it > 1 for it in res.values['id']))

        self.assertEqual(len(self.odps_df.name.head(2)), 2)
        self.assertEqual(len(self.odps_df.name.tail(2)), 2)

        res = self.pd_df.head(1)
        self.assertEqual(len(res), 1)

        df = self.pd_df[self.pd_df['name'] == 'name1']
        res = df.head(1)
        self.assertEqual(len(res), 1)
        self.assertTrue(context.is_cached(df))

        res = self.pd_df.tail(1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res.values['id'][0], 6)

        self.assertEqual(len(self.pd_df.name.head(1)), 1)
        self.assertEqual(len(self.pd_df.name.tail(1)), 1)

        class TunnelOnlyODPSEngine(ODPSSQLEngine):
            def _do_execute(self, *args, **kwargs):
                kwargs['_force_tunnel'] = True
                return super(TunnelOnlyODPSEngine, self)._do_execute(*args, **kwargs)

        engine = MixedEngine(self.odps)
        engine._odpssql_engine = TunnelOnlyODPSEngine(self.odps)

        res = engine.execute(self.odps_df['id'], head=3)
        self.assertIsNotNone(res)
        self.assertEqual(sum(res.values['id']), 6)

        table_name = tn('pyodps_df_mixed2')
        self.odps.delete_table(table_name, if_exists=True)
        table = next(self.odps_df.data_source())
        table2 = self.odps.create_table(table_name, table.schema)
        try:
            res = DataFrame(table2).head(10)
            self.assertEqual(len(res), 0)
        finally:
            table2.drop()

    def testMapReduceWithResource(self):
        pd_df2 = self.odps_df.to_pandas(wrap=True)

        @output(['name', 'id'], ['string', 'int'])
        def reducer(resources):
            d = dict()
            for r in resources[0]:
                if r.name in d:
                    d[r.name] += r.id
                else:
                    d[r.name] = r.id

            def inner(keys):
                def h(row, done):
                    if row.name in d:
                        d[row.name] += row.id
                    else:
                        d[row.name] = row.id

                    if done:
                        yield row.name, d[row.name]
                return h
            return inner

        expr = pd_df2.map_reduce(reducer=reducer, reducer_resources=[self.pd_df], group='name')
        result = expr.execute()
        self.assertEqual(result.values['id'].sum(), 17)

        odps_df2 = self.pd_df.persist(tn('pyodps_df_mixed2'), odps=self.odps)
        try:
            expr = self.odps_df.map_reduce(reducer=reducer, reducer_resources=[odps_df2], group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)

            expr = self.odps_df.map_reduce(reducer=reducer, reducer_resources=[self.pd_df], group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)

            expr = pd_df2.map_reduce(reducer=reducer, reducer_resources=[odps_df2], group='name')
            result = expr.execute()
            self.assertEqual(result.values['id'].sum(), 17)
        finally:
            next(odps_df2.data_source()).drop()

    def testBloomFilter(self):
        import numpy as np

        data2 = [['name1'], ['name3']]

        table_name = tn('pyodps_test_mixed_engine_bf_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name,
                                        schema=Schema.from_lists(['name'], ['string']))
        expr2 = DataFrame(table2)
        self.odps.write_table(table2, 0, data2)
        try:
            expr = self.odps_df.bloom_filter('name', expr2[:1].name, capacity=10)
            res = self.engine.execute(expr)
            self.assertTrue(np.all(res['name'] != 'name2'))
        finally:
            table2.drop()

    def testCachePersist(self):
        expr = self.odps_df

        data2 = [['name1', 3.2], ['name3', 2.4]]

        table_name = tn('pyodps_test_mixed_engine_cp_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name,
            schema=Schema.from_lists(['name', 'fid'], ['string', 'double']))
        expr2 = DataFrame(table2)
        self.odps.write_table(table2, 0, data2)

        @output(expr.schema.names, expr.schema.types)
        def h(row):
            yield row

        l = expr.filter(expr.id > 0).apply(h, axis=1).cache()
        r = expr2.filter(expr2.fid > 0)
        joined = l.join(r, on=['name', r.fid < 4])['id', 'fid'].cache()

        output_table = tn('pyodps_test_mixed_engine_cp_output_table')
        self.odps.delete_table(output_table, if_exists=True)
        schema = Schema.from_lists(['id', 'fid'], ['bigint', 'double'], ['ds'], ['string'])
        output_t = self.odps.create_table(output_table, schema, if_not_exists=True)

        t = joined.persist(output_table, partition='ds=today', create_partition=True)
        self.assertEqual(len(t.execute()), 2)

        # test seahawks fallback
        self.assertEqual(t.input.count().execute(), 2)

        output_t.drop()

    def testBigintPartitionedCache(self):
        table = tn('pyodps_test_bigint_partitioned_cache')
        self.odps.delete_table(table, if_exists=True)
        expr = self.odps_df.persist(table, partitions=['id'])

        @output(['id', 'name'], ['int', 'string'])
        def handle(row):
            return row.id + 1, row.name

        expr = expr['tt' + expr.name, expr.id].cache()
        new_expr = expr.map_reduce(mapper=handle)

        res = self.engine.execute(new_expr)
        self.assertEqual(len(res), 3)

    def testAsync(self):
        expr = self.odps_df[self.odps_df.name == 'name1']
        future = self.engine.execute(expr, async_=True)
        self.assertFalse(future.done())
        res = future.result()
        self.assertEqual(len(res), 2)

    def testBatch(self):
        odps_expr = self.odps_df[self.odps_df.id < 4].cache()
        expr = odps_expr.join(self.pd_df, 'name').sort('id_x')

        dag = self.engine.compile(expr)
        self.assertEqual(len(dag.nodes()), 3)

        f = self.engine.execute(expr, async_=True, n_parallel=2)
        result = f.result().values

        df = DataFrame(self.odps_df.to_pandas())
        expected = self.pd_engine.execute(df.join(self.pd_df, 'name').sort('id_x')).values
        self.assertTrue(result.equals(expected))

    def testBatchStop(self):
        self.engine._selecter.force_odps = True

        expr1 = self.odps_df[self.odps_df.id < 3].cache()
        expr2 = self.odps_df[self.odps_df.id > 3].cache()
        expr3 = expr1.union(expr2)

        self.engine.execute([expr1, expr2, expr3], n_parallel=2, async_=True)
        time.sleep(2)

        instance_ids = self.engine._odpssql_engine._instances
        self.assertEqual(len(instance_ids), 2)

        self.engine.stop()
        instances = [self.odps.get_instance(i) for i in instance_ids]
        [i.wait_for_completion() for i in instances]
        self.assertEqual(
            list(instances[0].get_task_statuses().values())[0].status,
            Instance.Task.TaskStatus.CANCELLED)
        self.assertEqual(
            list(instances[1].get_task_statuses().values())[0].status,
            Instance.Task.TaskStatus.CANCELLED)

    def testFailure(self):
        from odps.df.backends.errors import DagDependencyError

        expr1 = self.odps_df[self.odps_df.id / 0 < 0].cache()
        expr2 = expr1.count()

        fs = self.engine.execute(expr2, async_=True)
        self.assertRaises(DagDependencyError, fs.result)

    def testAppendIDCache(self):
        options.ml.dry_run = False

        @output(['id1'] + self.odps_df.schema.names, ['int'] + self.odps_df.schema.types)
        def h(row):
            yield row

        expr1 = self.odps_df.append_id(id_col='id1').apply(h, axis=1)
        expr2 = self.odps_df.append_id(id_col='id2')
        expr3 = expr1.join(expr2, on='id')['id1', 'id2']
        self.assertEqual(len(expr3.execute()), 3)

    def testAppendId(self):
        options.ml.dry_run = False

        expr = self.odps_df['name', ].distinct()
        expr = expr.append_id(id_col='id2')
        expr = expr.join(self.odps_df, on=['name'])

        tablename = tn('pyodps_test_append_id_persist')
        self.odps.delete_table(tablename, if_exists=True)
        expr.persist(tablename, partitions=['name'], lifecycle=1)

    def testHorzConcat(self):
        options.ml.dry_run = False

        table_name = tn('test_horz_concat_table2_xxx_yyy')
        self.odps.delete_table(table_name, if_exists=True)
        result_table_name = tn('test_horz_concat_result')
        self.odps.delete_table(result_table_name, if_exists=True)

        self.odps_df[self.odps_df.name, (self.odps_df.id * 2).rename('ren_id')].persist(table_name)
        df2 = self.odps.get_table(table_name).to_df()
        df2 = df2[:3]

        expr = self.odps_df.concat(df2.ren_id, axis=1)
        expr.persist(result_table_name, lifecycle=1)

    def testAsTypeMapReduce(self):
        expr = self.odps_df[self.odps_df.exclude('id'), self.odps_df.id.astype('float')]
        expr = expr.filter(expr.id < 10)['id', 'name']

        @output(['id', 'name'], ['float', 'string'])
        def h(group):
            def inn(row, done):
                yield row
            return inn

        expr = expr.map_reduce(reducer=h)
        expr.execute()

        expr = self.odps_df[self.odps_df.exclude('id'), self.odps_df.id.astype('float')]
        expr = expr.filter(expr.id < 10).distinct('id', 'name')

        @output(['id', 'name'], ['float', 'string'])
        def h(group):
            def inn(row, done):
                yield row
            return inn

        expr = expr.map_reduce(reducer=h)
        expr.execute()
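# The class above exercises MixedEngine, which compiles a DataFrame expression
# into a DAG and dispatches each node to either the ODPS SQL backend or the
# local pandas backend. A minimal usage sketch with the names used in this
# suite (``odps`` is assumed to be a configured ODPS entry object):
#
#   engine = MixedEngine(odps)
#   odps_df = DataFrame(odps.get_table('some_table'))         # ODPS-backed source
#   pd_df = DataFrame(pandas.DataFrame(data, columns=names))  # local source
#   result = engine.execute(odps_df.join(pd_df, 'name'))      # mixed execution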
class Test(MLTestBase):
    def setUp(self):
        super(Test, self).setUp()
        self.create_iris(IRIS_TABLE)
        self.df = DataFrame(self.odps.get_table(IRIS_TABLE))

    def testCollectionLabelling(self):
        # select_features
        self.assertRaises(ValueError, lambda: self.df.select_features())
        df2 = self.df.select_features('sepal_length sepal_width petal_length')
        self.assertEqual(
            _df_roles(df2),
            dict(category='', sepal_width='FEATURE', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width=''))
        df3 = df2.select_features('petal_width', add=True)
        self.assertEqual(
            _df_roles(df3),
            dict(category='', sepal_width='FEATURE', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width='FEATURE'))

        # exclude_fields
        self.assertRaises(ValueError, lambda: self.df.exclude_fields())
        df4 = df3.exclude_fields('sepal_length sepal_width')
        self.assertEqual(
            _df_roles(df4),
            dict(category='', sepal_width='', sepal_length='',
                 petal_length='FEATURE', petal_width='FEATURE'))

        # weight_field
        self.assertRaises(ValueError, lambda: self.df.weight_field(None))
        df5 = df3.weight_field('sepal_width')
        self.assertEqual(
            _df_roles(df5),
            dict(category='', sepal_width='WEIGHT', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width='FEATURE'))

        # label_field
        self.assertRaises(ValueError, lambda: self.df.label_field(None))
        df6 = self.df.label_field('category')
        self.assertEqual(
            _df_roles(df6),
            dict(category='LABEL', sepal_width='FEATURE', sepal_length='FEATURE',
                 petal_length='FEATURE', petal_width='FEATURE'))

        # roles
        self.assertIs(self.df, self.df.roles())
        df7 = self.df.roles(label='category', weight='sepal_width')
        self.assertEqual(
            _df_roles(df7),
            dict(category='LABEL', petal_length='FEATURE', petal_width='FEATURE',
                 sepal_width='WEIGHT', sepal_length='FEATURE'))

        # discrete
        df8 = self.df.discrete('sepal_width, sepal_length')
        self.assertEqual(
            _df_continuity(df8),
            dict(category='DISCRETE', sepal_width='DISCRETE', sepal_length='DISCRETE',
                 petal_length='CONTINUOUS', petal_width='CONTINUOUS'))

        # continuous
        df9 = df8.continuous('sepal_width')
        self.assertEqual(
            _df_continuity(df9),
            dict(category='DISCRETE', sepal_width='CONTINUOUS', sepal_length='DISCRETE',
                 petal_length='CONTINUOUS', petal_width='CONTINUOUS'))

        # key_value
        df10 = self.df.key_value('sepal_length sepal_width')
        self.assertEqual(
            _df_key_value(df10),
            dict(category='', petal_length='', petal_width='',
                 sepal_width='KVConfig(kv=:, item=,)',
                 sepal_length='KVConfig(kv=:, item=,)'))
        df11 = df10.key_value('sepal_length', kv='-', item=';')
        self.assertEqual(
            _df_key_value(df11),
            dict(category='', petal_length='', petal_width='',
                 sepal_width='KVConfig(kv=:, item=,)',
                 sepal_length='KVConfig(kv=-, item=;)'))

        # erase_key_value
        df12 = df10.erase_key_value('sepal_width')
        self.assertEqual(
            _df_key_value(df12),
            dict(category='', petal_length='', petal_width='', sepal_width='',
                 sepal_length='KVConfig(kv=:, item=,)'))

    def testSeqFieldOperations(self):
        seq = self.df.sepal_length

        # roles
        seq1 = seq.role('weight')
        self.assertEqual(_df_roles(seq1), dict(sepal_length='WEIGHT'))

        # discrete
        seq2 = seq.discrete()
        self.assertEqual(_df_continuity(seq2), dict(sepal_length='DISCRETE'))

        # continuous
        seq3 = seq.continuous()
        self.assertEqual(_df_continuity(seq3), dict(sepal_length='CONTINUOUS'))

        # key_value
        seq4 = seq.key_value()
        self.assertEqual(_df_key_value(seq4), dict(sepal_length='KVConfig(kv=:, item=,)'))
        seq5 = seq4.key_value(kv='-', item=';')
        self.assertEqual(_df_key_value(seq5), dict(sepal_length='KVConfig(kv=-, item=;)'))

        # erase_key_value
        seq6 = seq5.erase_key_value()
        self.assertEqual(_df_key_value(seq6), dict(sepal_length=''))

    def testCollectionOperations(self):
        splited = self.df.split(0.75)
        self.assertEqual(len(splited), 2)
        self.assertEqual(_df_roles(splited[0]), _df_roles(splited[1]))
        self.assertEqual(splited[0]._algo, 'Split')
        self.assertEqual(splited[0]._params['fraction'], 0.75)

        id_appended = self.df.append_id()
        self.assertEqual(
            _df_roles(id_appended),
            dict(category='FEATURE', petal_length='FEATURE', petal_width='FEATURE',
                 sepal_width='FEATURE', sepal_length='FEATURE', append_id=''))
        self.assertEqual(id_appended._algo, 'AppendID')
        self.assertEqual(id_appended._params['IDColName'], 'append_id')

    def testDTypes(self):
        rstrip_lines = lambda s: '\n'.join(l.rstrip() for l in s.splitlines())
        old_dtypes_repr = rstrip_lines(textwrap.dedent("""
        odps.Schema {
          sepal_length            float64
          sepal_width             float64
          petal_length            float64
          petal_width             float64
          category                string
        }
        """)).strip()
        self.assertEqual(rstrip_lines(repr(self.df.dtypes)).strip(), old_dtypes_repr)

        new_df = self.df.roles(label='category').key_value('sepal_length')
        new_dtypes_repr = rstrip_lines(textwrap.dedent("""
        odps.Schema {
          sepal_length            KV(':', ',')   FEATURE
          sepal_width             float64        FEATURE
          petal_length            float64        FEATURE
          petal_width             float64        FEATURE
          category                string         LABEL
        }
        """)).strip()
        self.assertEqual(rstrip_lines(repr(new_df.dtypes)).strip(), new_dtypes_repr)

    def testMerge(self):
        from odps.ml.expr.mixin import merge_data

        self.odps.delete_table(TEMP_TABLE_1_NAME, if_exists=True)
        self.odps.execute_sql(
            'create table {0} (col11 string, col12 string) lifecycle 1'.format(TEMP_TABLE_1_NAME))
        self.odps.delete_table(TEMP_TABLE_2_NAME, if_exists=True)
        self.odps.execute_sql(
            'create table {0} (col21 string, col22 string) lifecycle 1'.format(TEMP_TABLE_2_NAME))

        df1 = DataFrame(self.odps.get_table(TEMP_TABLE_1_NAME))
        df2 = DataFrame(self.odps.get_table(TEMP_TABLE_2_NAME))

        self.assertRaises(ValueError, lambda: merge_data(df1))

        merged1 = merge_data(df1, df2)
        self.assertEqual(
            _df_roles(merged1),
            dict(col21='FEATURE', col11='FEATURE', col12='FEATURE', col22='FEATURE'))

        merged2 = merge_data((df1, 'col11'), (df2, 'col21', True))
        self.assertEqual(_df_roles(merged2), dict(col11='FEATURE', col22='FEATURE'))

        merged3 = merge_data((df1, 'col11'), (df2, 'col21', True), auto_rename=True)
        self.assertEqual(_df_roles(merged3), dict(t0_col11='FEATURE', t1_col22='FEATURE'))

        merged4 = df1.merge_with(df2)
        self.assertEqual(
            _df_roles(merged4),
            dict(col21='FEATURE', col11='FEATURE', col12='FEATURE', col22='FEATURE'))

        options.ml.dry_run = True
        merged4._add_case(self.gen_check_params_case({
            'outputTableName': 'merged_table',
            'inputTableNames': TEMP_TABLE_1_NAME + ',' + TEMP_TABLE_2_NAME,
            'inputPartitionsInfoList': ',',
            'selectedColNamesList': 'col11,col12;col21,col22'}))
        merged4.persist('merged_table')

    def testSampleClass(self):
        from ..core import AlgoExprMixin

        num_sampled = self.df.sample(n=20)
        self.assertIsInstance(num_sampled, AlgoExprMixin)
        self.assertEqual(num_sampled._algo, 'RandomSample')

        frac_sampled = self.df.sample(frac=0.5)
        self.assertIsInstance(frac_sampled, AlgoExprMixin)
        self.assertEqual(frac_sampled._algo, 'RandomSample')

        weighted_sampled = self.df.sample(frac=0.5, weights=self.df.sepal_length)
        self.assertIsInstance(weighted_sampled, AlgoExprMixin)
        self.assertEqual(weighted_sampled._algo, 'WeightedSample')
        self.assertEqual(weighted_sampled._params['probCol'], 'sepal_length')

        stratified_sampled = self.df.sample(frac={'Iris-setosa': 0.5}, strata='category')
        self.assertIsInstance(stratified_sampled, AlgoExprMixin)
        self.assertEqual(stratified_sampled._algo, 'StratifiedSample')
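# The labelling calls tested above attach ML metadata (roles, continuity,
# key-value configs) to DataFrame fields without touching the underlying data.
# A brief sketch, assuming ``df`` wraps the iris table:
#
#   df.roles(label='category', weight='sepal_width')   # bulk role assignment
#   df.select_features('sepal_length sepal_width')     # mark FEATURE columns
#   df.key_value('sepal_length', kv=':', item=',')     # mark a KV-encoded field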
class Test(MLTestBase):
    def setUp(self):
        super(Test, self).setUp()
        self.create_iris(IRIS_TABLE)
        self.df = DataFrame(self.odps.get_table(IRIS_TABLE))

    def test_coll_field_operations(self):
        # select_features
        self.assertRaises(ValueError, lambda: self.df.select_features())
        df2 = self.df.select_features("sepal_length sepal_width petal_length")
        self.assertEqual(
            _df_roles(df2),
            dict(category="", sepal_width="FEATURE", sepal_length="FEATURE",
                 petal_length="FEATURE", petal_width=""),
        )
        df3 = df2.select_features("petal_width", add=True)
        self.assertEqual(
            _df_roles(df3),
            dict(category="", sepal_width="FEATURE", sepal_length="FEATURE",
                 petal_length="FEATURE", petal_width="FEATURE"),
        )

        # exclude_fields
        self.assertRaises(ValueError, lambda: self.df.exclude_fields())
        df4 = df3.exclude_fields("sepal_length sepal_width")
        self.assertEqual(
            _df_roles(df4),
            dict(category="", sepal_width="", sepal_length="",
                 petal_length="FEATURE", petal_width="FEATURE"),
        )

        # weight_field
        self.assertRaises(ValueError, lambda: self.df.weight_field(None))
        df5 = df3.weight_field("sepal_width")
        self.assertEqual(
            _df_roles(df5),
            dict(category="", sepal_width="WEIGHT", sepal_length="FEATURE",
                 petal_length="FEATURE", petal_width="FEATURE"),
        )

        # label_field
        self.assertRaises(ValueError, lambda: self.df.label_field(None))
        df6 = self.df.label_field("category")
        self.assertEqual(
            _df_roles(df6),
            dict(category="LABEL", sepal_width="FEATURE", sepal_length="FEATURE",
                 petal_length="FEATURE", petal_width="FEATURE"),
        )

        # roles
        self.assertIs(self.df, self.df.roles())
        df7 = self.df.roles(label="category", weight="sepal_width")
        self.assertEqual(
            _df_roles(df7),
            dict(category="LABEL", petal_length="FEATURE", petal_width="FEATURE",
                 sepal_width="WEIGHT", sepal_length="FEATURE"),
        )

        # discrete
        df8 = self.df.discrete("sepal_width, sepal_length")
        self.assertEqual(
            _df_continuity(df8),
            dict(category="DISCRETE", sepal_width="DISCRETE", sepal_length="DISCRETE",
                 petal_length="CONTINUOUS", petal_width="CONTINUOUS"),
        )

        # continuous
        df9 = df8.continuous("sepal_width")
        self.assertEqual(
            _df_continuity(df9),
            dict(category="DISCRETE", sepal_width="CONTINUOUS", sepal_length="DISCRETE",
                 petal_length="CONTINUOUS", petal_width="CONTINUOUS"),
        )

        # key_value
        df10 = self.df.key_value("sepal_length sepal_width")
        self.assertEqual(
            _df_key_value(df10),
            dict(category="", petal_length="", petal_width="",
                 sepal_width="KVConfig(kv=:, item=,)",
                 sepal_length="KVConfig(kv=:, item=,)"),
        )
        df11 = df10.key_value("sepal_length", kv="-", item=";")
        self.assertEqual(
            _df_key_value(df11),
            dict(category="", petal_length="", petal_width="",
                 sepal_width="KVConfig(kv=:, item=,)",
                 sepal_length="KVConfig(kv=-, item=;)"),
        )

        # erase_key_value
        df12 = df10.erase_key_value("sepal_width")
        self.assertEqual(
            _df_key_value(df12),
            dict(category="", petal_length="", petal_width="", sepal_width="",
                 sepal_length="KVConfig(kv=:, item=,)"),
        )

    def test_seq_field_operations(self):
        seq = self.df.sepal_length

        # roles
        seq1 = seq.role("weight")
        self.assertEqual(_df_roles(seq1), dict(sepal_length="WEIGHT"))

        # discrete
        seq2 = seq.discrete()
        self.assertEqual(_df_continuity(seq2), dict(sepal_length="DISCRETE"))

        # continuous
        seq3 = seq.continuous()
        self.assertEqual(_df_continuity(seq3), dict(sepal_length="CONTINUOUS"))

        # key_value
        seq4 = seq.key_value()
        self.assertEqual(_df_key_value(seq4), dict(sepal_length="KVConfig(kv=:, item=,)"))
        seq5 = seq4.key_value(kv="-", item=";")
        self.assertEqual(_df_key_value(seq5), dict(sepal_length="KVConfig(kv=-, item=;)"))

        # erase_key_value
        seq6 = seq5.erase_key_value()
        self.assertEqual(_df_key_value(seq6), dict(sepal_length=""))

    def test_coll_df_operations(self):
        from odps.ml.nodes import transform_nodes as tnodes

        splited = self.df.split(0.75)
        self.assertEqual(len(splited), 2)
        self.assertEqual(_df_roles(splited[0]), _df_roles(splited[1]))
        split_node = adapter_from_df(splited[0])._bind_node
        self.assertEqual(split_node.code_name, "Split")
        self.assertEqual(split_node.parameters["fraction"], 0.75)

        id_appended = self.df.append_id()
        self.assertEqual(
            _df_roles(id_appended),
            dict(category="FEATURE", petal_length="FEATURE", petal_width="FEATURE",
                 sepal_width="FEATURE", sepal_length="FEATURE", append_id=""),
        )
        append_id_node = adapter_from_df(id_appended)._bind_node
        self.assertEqual(append_id_node.code_name, "AppendID")
        self.assertEqual(append_id_node.parameters["IDColName"], "append_id")

        summary_ep = self.df._create_summary_adapter()
        summary_node = summary_ep._bind_node
        self.assertIsInstance(summary_node, tnodes.SummaryNode)

    def test_dtypes(self):
        rstrip_lines = lambda s: "\n".join(l.rstrip() for l in s.splitlines())
        old_dtypes_repr = rstrip_lines(textwrap.dedent("""
        odps.Schema {
          sepal_length            float64
          sepal_width             float64
          petal_length            float64
          petal_width             float64
          category                string
        }
        """)).strip()
        self.assertEqual(rstrip_lines(repr(self.df.dtypes)).strip(), old_dtypes_repr)

        new_df = self.df.roles(label="category").key_value("sepal_length")
        new_dtypes_repr = rstrip_lines(textwrap.dedent("""
        odps.Schema {
          sepal_length            KV(':', ',')   FEATURE
          sepal_width             float64        FEATURE
          petal_length            float64        FEATURE
          petal_width             float64        FEATURE
          category                string         LABEL
        }
        """)).strip()
        self.assertEqual(rstrip_lines(repr(new_df.dtypes)).strip(), new_dtypes_repr)

    def test_merge(self):
        self.odps.delete_table(TEMP_TABLE_1_NAME, if_exists=True)
        self.odps.execute_sql(
            "create table {0} (col11 string, col12 string) lifecycle 1".format(TEMP_TABLE_1_NAME))
        self.odps.delete_table(TEMP_TABLE_2_NAME, if_exists=True)
        self.odps.execute_sql(
            "create table {0} (col21 string, col22 string) lifecycle 1".format(TEMP_TABLE_2_NAME))

        df1 = DataFrame(self.odps.get_table(TEMP_TABLE_1_NAME))
        df2 = DataFrame(self.odps.get_table(TEMP_TABLE_2_NAME))

        self.assertRaises(ValueError, lambda: merge_data(df1))

        merged1 = merge_data(df1, df2)
        self.assertEqual(
            _df_roles(merged1),
            dict(col21="FEATURE", col11="FEATURE", col12="FEATURE", col22="FEATURE"))

        merged2 = merge_data((df1, "col11"), (df2, "col21", True))
        self.assertEqual(_df_roles(merged2), dict(col11="FEATURE", col22="FEATURE"))

        merged3 = merge_data((df1, "col11"), (df2, "col21", True), auto_rename=True)
        self.assertEqual(_df_roles(merged3), dict(t0_col11="FEATURE", t1_col22="FEATURE"))

        merged4 = df1.merge_with(df2)
        self.assertEqual(
            _df_roles(merged4),
            dict(col21="FEATURE", col11="FEATURE", col12="FEATURE", col22="FEATURE"))

    def test_sample(self):
        num_sampled = self.df.sample(n=20)
        adapter = adapter_from_df(num_sampled)
        self.assertIsInstance(num_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "RandomSample")

        frac_sampled = self.df.sample(frac=0.5)
        adapter = adapter_from_df(frac_sampled)
        self.assertIsInstance(frac_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "RandomSample")

        weighted_sampled = self.df.sample(frac=0.5, weights=self.df.sepal_length)
        adapter = adapter_from_df(weighted_sampled)
        self.assertIsInstance(weighted_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "WeightedSample")
        self.assertEqual(adapter._bind_node.parameters["probCol"], "sepal_length")

        stratified_sampled = self.df.sample(frac={"Iris-setosa": 0.5}, strata="category")
        adapter = adapter_from_df(stratified_sampled)
        self.assertIsInstance(stratified_sampled, DataFrame)
        self.assertEqual(adapter._bind_node.code_name, "StratifiedSample")

    def test_batch_persist(self):
        options.runner.dry_run = False

        call_seq = []
        dfs = []
        tables = []
        for idx in range(3):
            write_str = "F%d" % idx

            def gen_fun(wobj):
                return lambda _: call_seq.append(wobj)

            f = gen_fun((write_str, "U"))
            df_upper = self.mock_action(self.df, action=f)
            f = gen_fun((write_str, "D"))
            df_lower = self.mock_action(df_upper, action=f)

            dfs.append(df_lower)
            tables.append("TN" + str(idx))

        DataFrame.batch_persist(dfs, tables)

        for idx in range(3):
            write_str = "F%d" % idx
            self.assertListEqual([p[1] for p in call_seq if p[0] == write_str], list("UD"))
        for dir in "UD":
            self.assertListEqual(sorted(p[0] for p in call_seq if p[1] == dir), ["F0", "F1", "F2"])
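# ``DataFrame.batch_persist`` above submits several persist operations as one
# batch; the test only checks the ordering of the mocked upstream/downstream
# actions. The call shape, assuming parallel lists of expressions and target
# table names:
#
#   DataFrame.batch_persist([df0, df1, df2], ['table0', 'table1', 'table2'])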