def testCachePersist(self):
        """Persist a cached join into a partitioned output table.

        A filtered, row-wise applied and cached view of ``self.odps_df`` is
        joined with a filtered second table; the cached ``id``/``fid``
        projection is persisted into partition ``ds=today`` of the output
        table, and executing the persisted result must yield 2 rows.

        Both tables created here are removed in a ``finally`` clause so a
        failing assertion does not leave them behind.
        """
        expr = self.odps_df

        data2 = [["name1", 3.2], ["name3", 2.4]]

        table_name = tn("pyodps_test_mixed_engine_cp_table2")
        output_table = tn("pyodps_test_mixed_engine_cp_output_table")

        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(
            name=table_name, schema=Schema.from_lists(["name", "fid"], ["string", "double"])
        )
        try:
            expr2 = DataFrame(table2)
            self.odps.write_table(table2, 0, data2)

            @output(expr.schema.names, expr.schema.types)
            def h(row):
                # Identity mapper: re-emits every row unchanged.
                yield row

            left = expr.filter(expr.id > 0).apply(h, axis=1).cache()
            right = expr2.filter(expr2.fid > 0)
            joined = left.join(right, on=["name", right.fid < 4])["id", "fid"].cache()

            self.odps.delete_table(output_table, if_exists=True)
            schema = Schema.from_lists(["id", "fid"], ["bigint", "double"], ["ds"], ["string"])
            self.odps.create_table(output_table, schema, if_not_exists=True)

            t = joined.persist(output_table, partition="ds=today", create_partition=True)
            self.assertEqual(len(t.execute()), 2)
        finally:
            # Clean up even when the test body fails; ``if_exists`` keeps the
            # cleanup safe if a table was never created.
            try:
                self.odps.delete_table(output_table, if_exists=True)
            finally:
                self.odps.delete_table(table_name, if_exists=True)
    def setup(self):
        """Create two mock-table-backed collection expressions.

        ``self.expr`` wraps a table with columns ``name``/``id`` and
        ``self.expr2`` one with columns ``name2``/``id2``; both column pairs
        are declared as ``string``/``int64``.
        """

        def to_types(*type_names):
            # Pass every type name through validate_data_type.
            return [validate_data_type(type_name) for type_name in type_names]

        first_schema = Schema.from_lists(["name", "id"], to_types("string", "int64"))
        first_table = MockTable(name="pyodps_test_expr_table", schema=first_schema)
        self.expr = CollectionExpr(_source_data=first_table, _schema=first_schema)

        second_schema = Schema.from_lists(["name2", "id2"], to_types("string", "int64"))
        second_table = MockTable(name="pyodps_test_expr_table2", schema=second_schema)
        self.expr2 = CollectionExpr(_source_data=second_table, _schema=second_schema)
    def setup(self):
        """Populate ``self.expr`` and ``self.expr2`` from mock tables.

        Each expression is backed by a ``MockTable`` whose two columns are
        declared as ``string`` and ``int64``.
        """
        fixtures = (
            ('expr', 'pyodps_test_expr_table', ['name', 'id']),
            ('expr2', 'pyodps_test_expr_table2', ['name2', 'id2']),
        )
        for attr_name, table_name, column_names in fixtures:
            column_types = [validate_data_type(t) for t in ('string', 'int64')]
            schema = Schema.from_lists(column_names, column_types)
            mock_table = MockTable(name=table_name, schema=schema)
            setattr(self, attr_name,
                    CollectionExpr(_source_data=mock_table, _schema=schema))
Ejemplo n.º 4
0
    def testChineseSchema(self):
        """Check lookups on a schema whose column and partition names are Chinese.

        The names are deliberately passed both with and without the ``u``
        prefix, for construction as well as for lookup.
        """
        schema = Schema.from_lists([u'用户'], ['string'], ['分区'], ['bigint'])
        self.assertIn('用户', schema)

        # Explicit accessor methods.
        column = schema.get_column('用户')
        self.assertEqual(column.type.name, 'string')
        partition = schema.get_partition(u'分区')
        self.assertEqual(partition.type.name, 'bigint')

        # Item access.
        self.assertEqual(schema['用户'].type.name, 'string')
        self.assertEqual(schema[u'分区'].type.name, 'bigint')

        # Same schema, with the ``u`` prefixes swapped between the two name lists.
        mirrored = Schema.from_lists(['用户'], ['string'], [u'分区'], ['bigint'])
        self.assertEqual(schema, mirrored)
    def setup(self):
        """Create a plain and a partitioned mock-table expression.

        ``self.expr`` is backed by a table with columns ``name``/``id``/``fid``;
        ``self.expr2`` uses the same columns plus partitions ``part1``/``part2``.
        Both mock tables get ``self.config.odps.rest`` as their ``_client``.
        """
        plain_schema = Schema.from_lists(
            ['name', 'id', 'fid'],
            [types.string, types.int64, types.float64],
        )
        plain_table = MockTable(name='pyodps_test_expr_table', schema=plain_schema)
        plain_table._client = self.config.odps.rest
        self.expr = CollectionExpr(_source_data=plain_table, _schema=plain_schema)

        partitioned_schema = Schema.from_lists(
            ['name', 'id', 'fid'],
            [types.string, types.int64, types.float64],
            ['part1', 'part2'],
            [types.string, types.int64],
        )
        partitioned_table = MockTable(name='pyodps_test_expr_table2',
                                      schema=partitioned_schema)
        partitioned_table._client = self.config.odps.rest
        self.expr2 = CollectionExpr(_source_data=partitioned_table,
                                    _schema=partitioned_schema)
    def testTableResource(self):
        """Create a table resource, re-fetch it, and update its partition twice.

        The test first creates a resource on an unpartitioned table, then
        recreates the table with partitions ``pt``/``sec`` and points the
        resource at two different partitions via ``update``.
        """
        test_table_name = tn('pyodps_t_tmp_resource_table')
        schema = Schema.from_lists(['id', 'name'], ['string', 'string'])
        self.odps.delete_table(test_table_name, if_exists=True)
        self.odps.create_table(test_table_name, schema)

        resource_name = tn('pyodps_t_tmp_table_resource')
        # Best-effort removal of a resource left over from an earlier run.
        try:
            self.odps.delete_resource(resource_name)
        except errors.NoSuchObject:
            pass
        res = self.odps.create_resource(resource_name, 'table', table_name=test_table_name)
        self.assertIsInstance(res, TableResource)
        self.assertEqual(res.get_source_table().name, test_table_name)
        self.assertIsNone(res.get_source_table_partition())
        # Fetching by name must hand back the very same object.
        self.assertIs(res, self.odps.get_resource(resource_name))

        del res.parent[resource_name]  # delete from cache

        # After the deletion above, a fetch must yield a different object
        # that still describes the same source table.
        self.assertIsNot(res, self.odps.get_resource(resource_name))
        res = self.odps.get_resource(resource_name)
        self.assertIsInstance(res, TableResource)
        self.assertEqual(res.get_source_table().name, test_table_name)
        self.assertIsNone(res.get_source_table_partition())

        # Recreate the source table under the same name, now partitioned.
        test_table_name = tn('pyodps_t_tmp_resource_table')
        test_table_partition = 'pt=test,sec=1'
        schema = Schema.from_lists(['id', 'name'], ['string', 'string'], ['pt', 'sec'], ['string', 'bigint'])
        self.odps.delete_table(test_table_name, if_exists=True)
        table = self.odps.create_table(test_table_name, schema)
        table.create_partition(test_table_partition)

        resource_name = tn('pyodps_t_tmp_table_resource')
        # Point the existing resource at the first partition.
        res = res.update(partition=test_table_partition)
        self.assertIsInstance(res, TableResource)
        self.assertEqual(res.get_source_table().name, test_table_name)
        self.assertEqual(str(res.get_source_table_partition()),
                         str(types.PartitionSpec(test_table_partition)))
        self.assertIs(res, self.odps.get_resource(resource_name))

        # Switch the resource to a second partition.
        test_table_partition = 'pt=test,sec=2'
        table.create_partition(test_table_partition)
        res = res.update(partition=test_table_partition)
        self.assertIsInstance(res, TableResource)
        self.assertEqual(res.get_source_table().name, test_table_name)
        self.assertEqual(str(res.get_source_table_partition()),
                         str(types.PartitionSpec(test_table_partition)))
        self.assertIs(res, self.odps.get_resource(resource_name))

        # Remove the resource and the table created by this test.
        self.odps.delete_resource(resource_name)
        self.odps.delete_table(test_table_name)
    def testReadWriteTable(self):
        """Write four records to a fresh table and read them back.

        The rows are read with ``read_table`` (all rows, then with ``step=2``)
        and with ``table.head``; the table is deleted afterwards.
        """
        table_name = 'pyodps_t_tmp_read_write_table'
        schema = Schema.from_lists(['id', 'name', 'right'], ['bigint', 'string', 'boolean'])

        self.odps.delete_table(table_name, if_exists=True)
        self.assertFalse(self.odps.exist_table(table_name))

        table = self.odps.create_table(table_name, schema)
        rows = [
            [111, 'aaa', True],
            [222, 'bbb', False],
            [333, 'ccc', True],
            [444, '中文', False],
        ]
        row_count = len(rows)
        records = [Record(schema=schema, values=row) for row in rows]

        # Expected values have the name column passed through to_str.
        expected = [[ident, to_str(name), flag] for ident, name, flag in rows]

        self.odps.write_table(table, 0, records)

        all_rows = [record.values for record in self.odps.read_table(table, row_count)]
        self.assertSequenceEqual(expected, all_rows)

        stepped_rows = [record.values
                        for record in self.odps.read_table(table, row_count, step=2)]
        self.assertSequenceEqual(expected[::2], stepped_rows)

        head_rows = [record.values for record in table.head(row_count)]
        self.assertSequenceEqual(expected, head_rows)

        self.odps.delete_table(table_name)
        self.assertFalse(self.odps.exist_table(table_name))
    def testCreateDeleteTable(self):
        """Create and delete a table via the project's tables and via ``self.odps``.

        The first creation passes ``lifecycle=10``; the second passes
        ``shard_num=10`` and ``hub_lifecycle=5`` and checks the shard number.
        """
        name = 'pyodps_t_tmp_create_table'
        schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'], ['ds', ], ['string',])
        project_tables = self.odps._project.tables

        def assert_absent():
            # The table must not exist at this point.
            self.assertFalse(self.odps.exist_table(name))

        project_tables.delete(name, if_exists=True)
        assert_absent()

        created = project_tables.create(name, schema, lifecycle=10)
        self.assertEqual(created.name, name)
        self.assertEqual(created.schema, schema)
        self.assertEqual(created.lifecycle, 10)

        project_tables.delete(name, if_exists=True)
        assert_absent()

        created = self.odps.create_table(name, schema, shard_num=10, hub_lifecycle=5)
        self.assertEqual(created.name, name)
        self.assertEqual(created.schema, schema)
        self.assertNotEqual(created.lifecycle, 10)
        self.assertEqual(created.shard.shard_num, 10)

        self.odps.delete_table(name, if_exists=True)
        assert_absent()
    def setup(self):
        """Create a uniquely named table, its expression, and a Seahawks engine.

        Only the first five of the six listed columns are used, so the
        ``scale``/``decimal`` column is left out of the schema. A no-op
        progress-bar stand-in is stored as ``self.faked_bar``.
        """

        def to_types(*type_names):
            # Pass every type name through validate_data_type.
            return [validate_data_type(type_name) for type_name in type_names]

        all_names = ['name', 'id', 'fid', 'isMale', 'birth', 'scale']
        all_types = to_types('string', 'int64', 'float64', 'boolean', 'datetime',
                             'decimal')
        df_schema = Schema.from_lists(all_names[:5], all_types[:5])
        self.schema = df_schema_to_odps_schema(df_schema)

        unique_suffix = str(uuid.uuid4()).replace('-', '_')
        table_name = tn('pyodps_test_%s' % unique_suffix)
        self.odps.delete_table(table_name, if_exists=True)
        self.table = self.odps.create_table(name=table_name, schema=self.schema)
        self.expr = CollectionExpr(_source_data=self.table, _schema=df_schema)

        self.engine = SeahawksEngine(self.odps)

        class FakeBar(object):
            # Every method accepts any arguments and does nothing.

            def update(self, *args, **kwargs):
                pass

            def inc(self, *args, **kwargs):
                pass

            def status(self, *args, **kwargs):
                pass

        self.faked_bar = FakeBar()
    def _create_partitioned_table(self, table_name):
        """Drop ``table_name`` if it exists and recreate it partitioned by ``ds``.

        Returns whatever ``self.odps.create_table`` returns for the new table.
        """
        columns = (
            ('id', 'string'),
            ('int_num', 'bigint'),
            ('float_num', 'double'),
            ('bool', 'boolean'),
        )

        self.odps.delete_table(table_name, if_exists=True)
        schema = Schema.from_lists(
            [column_name for column_name, _ in columns],
            [type_name for _, type_name in columns],
            ['ds'],
            ['string'],
        )
        return self.odps.create_table(table_name, schema=schema)
    def testMakeKV(self):
        """Execute ``to_kv`` over six double columns and compare the result rows.

        The expected strings list only the non-None columns of each row, joined
        as ``key=value`` pairs. The table is dropped in ``finally``.
        """
        from odps import types as odps_types

        rows = [
            ['name1', 1.0, 3.0, None, 10.0, None, None],
            ['name1', None, 3.0, 5.1, None, None, None],
            ['name1', 7.1, None, None, None, 8.2, None],
            ['name2', None, 1.2, 1.5, None, None, None],
            ['name2', None, 1.0, None, None, None, 1.1],
        ]
        kv_columns = ['k1', 'k2', 'k3', 'k5', 'k7', 'k9']
        column_names = ['name'] + kv_columns
        column_types = [odps_types.string] + [odps_types.double] * 6
        odps_schema = Schema.from_lists(column_names, column_types)

        table_name = tn('pyodps_test_engine_make_kv')
        self.odps.delete_table(table_name, if_exists=True)
        kv_table = self.odps.create_table(name=table_name, schema=odps_schema)
        source_expr = CollectionExpr(
            _source_data=kv_table,
            _schema=odps_schema_to_df_schema(odps_schema),
        )
        try:
            self.odps.write_table(kv_table, 0, rows)
            kv_expr = source_expr.to_kv(columns=kv_columns, kv_delim='=')

            actual = self._get_result(self.engine.execute(kv_expr))

            expected = [
                ['name1', 'k1=1,k2=3,k5=10'],
                ['name1', 'k2=3,k3=5.1'],
                ['name1', 'k1=7.1,k7=8.2'],
                ['name2', 'k2=1.2,k3=1.5'],
                ['name2', 'k2=1,k9=1.1'],
            ]

            self.assertListEqual(actual, expected)
        finally:
            kv_table.drop()
    def testAXFException(self):
        """Expect a ``DatabaseError`` when executing the input of a persisted frame.

        ``self.expr`` is persisted into partition ``ds=today`` of a new
        partitioned table; executing ``df.input`` with the engine must raise
        ``sqlalchemy.exc.DatabaseError``. The table is deleted in ``finally``.
        """
        import sqlalchemy

        rows = [
            ['name1', 4, 5.3, None, None],
            ['name2', 2, 3.5, None, None],
            ['name1', 4, 4.2, None, None],
            ['name1', 3, 2.2, None, None],
            ['name1', 3, 4.1, None, None],
        ]
        self._gen_data(data=rows)

        target_name = tn('pyodps_test_engine_axf_seahawks_table')

        try:
            partitioned_schema = Schema.from_lists(
                self.schema.names, self.schema.types, ['ds'], ['string'])
            self.odps.create_table(target_name, partitioned_schema)
            persisted = self.engine.persist(
                self.expr,
                target_name,
                partition='ds=today',
                create_partition=True,
            )

            with self.assertRaises(sqlalchemy.exc.DatabaseError):
                self.engine.execute(persisted.input)
        finally:
            self.odps.delete_table(target_name, if_exists=True)
Ejemplo n.º 13
0
    def setup(self):
        """Create ``self.expr`` over a six-column mock table.

        Columns: ``name``, ``id``, ``fid``, ``isMale``, ``scale``, ``birth``.
        """
        column_names = ['name', 'id', 'fid', 'isMale', 'scale', 'birth']
        type_names = ('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime')
        column_types = [validate_data_type(type_name) for type_name in type_names]

        schema = Schema.from_lists(column_names, column_types)
        mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)

        self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
    def testPartitions(self):
        """Create three partitions, drop one, and inspect a remaining partition.

        The listed partition specs are compared (sorted, as strings) against
        the expected ones before and after dropping the first partition; a
        partition's ``columns`` must be non-empty before and after ``reload``.
        """
        table_name = tn('pyodps_t_tmp_partitions_table')
        specs = ['s=%s' % i for i in range(3)]
        schema = Schema.from_lists(['id'], ['string'], ['s'], ['string'])

        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(table_name, schema)
        for spec in specs:
            table.create_partition(spec)

        def expected_specs(raw_specs):
            return sorted([str(types.PartitionSpec(raw)) for raw in raw_specs])

        def listed_specs():
            return sorted([str(part.partition_spec) for part in table.partitions])

        self.assertEqual(expected_specs(specs), listed_specs())

        table.get_partition(specs[0]).drop()
        self.assertEqual(expected_specs(specs[1:]), listed_specs())

        first_partition = next(table.partitions)
        self.assertGreater(len(first_partition.columns), 0)
        first_partition.reload()
        self.assertGreater(len(first_partition.columns), 0)

        self.odps.delete_table(table_name)
Ejemplo n.º 15
0
    def setup(self):
        """Create an ODPS-backed and a pandas-backed DataFrame plus two engines.

        The table ``pyodps_df_mixed`` is recreated and filled with three rows;
        ``self.pd_df`` wraps a two-row pandas frame with the same column names.
        """
        import pandas as pd

        column_names = ['name', 'id']
        column_types = ['string', 'bigint']

        odps_rows = [
            ['name1', 1],
            ['name2', 2],
            ['name1', 3],
        ]
        pandas_rows = [
            ['name1', 5],
            ['name2', 6],
        ]

        table_name = 'pyodps_df_mixed'
        self.odps.delete_table(table_name, if_exists=True)
        self.t = self.odps.create_table(
            table_name, Schema.from_lists(column_names, column_types))
        with self.t.open_writer() as writer:
            writer.write([self.t.new_record(row) for row in odps_rows])

        self.odps_df = DataFrame(self.t)
        pandas_frame = pd.DataFrame(pandas_rows, columns=column_names)
        self.pd_df = DataFrame(pandas_frame)

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)
Ejemplo n.º 16
0
 def setup(self):
     """Create a source-less and a mock-sourced expression over all data types.

     The schema takes its column names and types from the keys and values
     of ``types._data_types``.
     """
     from odps.df.expr.tests.core import MockTable

     column_names = types._data_types.keys()
     column_types = types._data_types.values()
     schema = Schema.from_lists(column_names, column_types)

     self.expr = CollectionExpr(_source_data=None, _schema=schema)

     mock_table = MockTable(client=self.odps.rest)
     self.sourced_expr = CollectionExpr(_source_data=mock_table, _schema=schema)
Ejemplo n.º 17
0
    def setup(self):
        """Create ``self.expr`` over a three-column mock table and a fresh ``self.ctx``."""
        type_names = ("string", "int64", "float64")
        column_types = [validate_data_type(type_name) for type_name in type_names]

        schema = Schema.from_lists(["name", "id", "fid"], column_types)
        mock_table = MockTable(name="pyodps_test_expr_table", schema=schema)

        self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
        self.ctx = ExecuteContext()
    def testReadSQLWrite(self):
        """Copy the rows of a SQL query result into a second table.

        Three single-column records are written to the source table in two
        ``write_table`` calls; every record read from ``select *`` is then
        written to the target table. Both tables are dropped in ``finally``.
        """
        source_name = tn('pyodps_t_tmp_read_sql_instance_write')
        self.odps.delete_table(source_name, if_exists=True)
        source = self.odps.create_table(
            source_name,
            schema=Schema.from_lists(['size'], ['bigint']),
            if_not_exists=True,
        )

        # First write passes an explicit block id of 0, the second one omits it.
        self.odps.write_table(
            source, 0, [source.new_record([1]), source.new_record([2])])
        self.odps.write_table(source, [source.new_record([3])])

        target_name = tn('pyodps_t_tmp_read_sql_instance_write2')
        self.odps.delete_table(target_name, if_exists=True)
        target = self.odps.create_table(target_name, source.schema)

        try:
            instance = self.odps.execute_sql('select * from %s' % source_name)
            with instance.open_reader() as reader, target.open_writer() as writer:
                for record in reader:
                    writer.write(target.new_record(record.values))
        finally:
            source.drop()
            target.drop()
    def testReadBinarySQLInstance(self):
        """Round-trip raw byte strings through a table and a non-tunnel SQL read.

        ``options.tunnel.string_as_binary`` is switched on for the duration of
        the test. Two records whose ``name`` values are byte strings (UTF-8
        text mixed with escape, control and non-UTF-8 bytes) are written, and
        the values read back with ``open_reader(tunnel=False)`` must equal
        them after sorting.

        The table is removed and the option is restored to its previous value
        in ``finally`` clauses, so a failing assertion neither leaks the table
        nor changes global configuration for later tests.
        """
        # Remember the current setting instead of assuming it was False.
        previous_string_as_binary = options.tunnel.string_as_binary
        options.tunnel.string_as_binary = True
        test_table = tn('pyodps_t_tmp_read_binary_sql_instance')
        try:
            self.odps.delete_table(test_table, if_exists=True)
            table = self.odps.create_table(
                test_table,
                schema=Schema.from_lists(['size', 'name'],
                                         ['bigint', 'string']),
                if_not_exists=True)

            data = [
                [
                    1, u'中'.encode('utf-8') + b'\\\\n\\\n' + u'文'.encode('utf-8') +
                    b' ,\r\xe9'
                ],
                [
                    2, u'测试'.encode('utf-8') + b'\x00\x01\x02' +
                    u'数据'.encode('utf-8') + b'\xe9'
                ],
            ]
            self.odps.write_table(table, 0,
                                  [table.new_record(it) for it in data])

            with self.odps.execute_sql(
                    'select name from %s' %
                    test_table).open_reader(tunnel=False) as reader:
                read_data = sorted([r[0] for r in reader])
                expected_data = sorted([r[1] for r in data])

                self.assertSequenceEqual(read_data, expected_data)
        finally:
            try:
                # ``if_exists`` keeps cleanup safe when creation itself failed.
                self.odps.delete_table(test_table, if_exists=True)
            finally:
                options.tunnel.string_as_binary = previous_string_as_binary
    def testSubPartitions(self):
        """Create two-level partitions, iterate one root, and delete one partition.

        Three partitions are created under ``type=test`` plus one under
        ``type=test2``; ``iterate_partitions('type=test')`` must yield 3 items,
        and after deleting the first partition the listing must match the rest.
        """
        table_name = tn('pyodps_t_tmp_sub_partitions_table')
        root_spec = 'type=test'
        leaf_specs = ['s=%s' % i for i in range(3)]
        schema = Schema.from_lists(
            ['id'], ['string'], ['type', 's'], ['string', 'string'])

        self.odps.delete_table(table_name, if_exists=True)
        table = self.odps.create_table(table_name, schema)

        full_specs = [root_spec + ',' + leaf for leaf in leaf_specs]
        full_specs.append('type=test2,s=0')
        for spec in full_specs:
            table.create_partition(spec)

        def expected_specs(raw_specs):
            return sorted([str(types.PartitionSpec(raw)) for raw in raw_specs])

        def listed_specs():
            return sorted([str(part.partition_spec) for part in table.partitions])

        self.assertEqual(expected_specs(full_specs), listed_specs())

        under_root = list(table.iterate_partitions(root_spec))
        self.assertEqual(len(under_root), 3)

        table.delete_partition(full_specs[0])
        self.assertEqual(expected_specs(full_specs[1:]), listed_specs())

        self.odps.delete_table(table_name)
Ejemplo n.º 21
0
    def testRecordSetAndGetByIndex(self):
        """Set eight typed record fields by position and read them back.

        Also checks that ``build_snapshot`` leaves ``_snapshot`` as None when
        ``options.force_py`` is set and non-None otherwise, and that slicing
        the record with ``[:2]`` returns the first two values.
        """
        type_names = [
            'bigint', 'double', 'string', 'datetime', 'boolean', 'decimal',
            'array<string>', 'map<string,bigint>'
        ]
        schema = Schema.from_lists(['col%s' % i for i in range(8)], type_names)
        schema.build_snapshot()
        if options.force_py:
            self.assertIsNone(schema._snapshot)
        else:
            self.assertIsNotNone(schema._snapshot)

        def fresh_values():
            # Build new objects on every call so the comparisons below never
            # reuse the instances that were stored in the record.
            return [
                1, 1.2, 'abc',
                datetime(2016, 1, 1), True,
                _decimal.Decimal('1.111'), ['a', 'b'],
                OrderedDict({'a': 1}),
            ]

        record = Record(schema=schema)
        for index, value in enumerate(fresh_values()):
            record[index] = value

        self.assertSequenceEqual(record.values, fresh_values())
        for index, expected in enumerate(fresh_values()):
            self.assertEqual(expected, record[index])
        self.assertEqual([1, 1.2], record[:2])
Ejemplo n.º 22
0
    def setup(self):
        """Build ``self.expr`` on a mock table with six typed columns."""
        columns = (
            ('name', 'string'),
            ('id', 'int64'),
            ('fid', 'float64'),
            ('isMale', 'boolean'),
            ('scale', 'decimal'),
            ('birth', 'datetime'),
        )
        column_names = [column_name for column_name, _ in columns]
        column_types = [validate_data_type(type_name) for _, type_name in columns]

        schema = Schema.from_lists(column_names, column_types)
        mock_table = MockTable(name='pyodps_test_expr_table', schema=schema)

        self.expr = CollectionExpr(_source_data=mock_table, _schema=schema)
Ejemplo n.º 23
0
    def testRoomStores(self):
        """Store a table in a room and read its rows back through the room.

        The room uses a temporary directory that is removed in the outer
        ``finally``; the table is dropped in the inner ``finally``.
        """
        class FakeRoom(Room):
            # Override _init with a no-op.
            def _init(self):
                return

        room = FakeRoom("__test")
        room._room_dir = tempfile.mkdtemp()

        try:
            schema = Schema.from_lists(["name", "id"], ["string", "bigint"])
            table_name = "pyodps_test_room_stores"
            self.odps.delete_table(table_name, if_exists=True)
            table = self.odps.create_table(table_name, schema)
            rows = [["name1", 1], ["name2", 2]]
            with table.open_writer() as writer:
                writer.write(rows)

            # Discard the created object and look the table up again by name.
            del table
            table = self.odps.get_table(table_name)
            self.assertEqual(table.schema.names, ["name", "id"])

            try:
                room.store("table", table)

                stored = room["table"]
                self.assertEqual(stored.name, table_name)

                with stored.open_reader() as reader:
                    read_rows = [record.values for record in reader]
                    self.assertEqual(rows, read_rows)
            finally:
                table.drop()
        finally:
            shutil.rmtree(room._room_dir)
Ejemplo n.º 24
0
    def testCreateDeleteTable(self):
        """Create and delete a partitioned table through two different entry points.

        First via ``self.odps._project.tables`` with ``lifecycle=10``, then via
        ``self.odps.create_table`` with ``shard_num=10`` and ``hub_lifecycle=5``.
        """
        name = tn('pyodps_t_tmp_create_table')
        schema = Schema.from_lists(['id', 'name'], ['bigint', 'string'], ['ds', ], ['string',])
        project_tables = self.odps._project.tables

        def assert_absent():
            # The table must not exist at this point.
            self.assertFalse(self.odps.exist_table(name))

        project_tables.delete(name, if_exists=True)
        assert_absent()

        with_lifecycle = project_tables.create(name, schema, lifecycle=10)
        self.assertEqual(with_lifecycle.name, name)
        self.assertEqual(with_lifecycle.schema, schema)
        self.assertEqual(with_lifecycle.lifecycle, 10)

        project_tables.delete(name, if_exists=True)
        assert_absent()

        with_shards = self.odps.create_table(name, schema, shard_num=10, hub_lifecycle=5)
        self.assertEqual(with_shards.name, name)
        self.assertEqual(with_shards.schema, schema)
        self.assertNotEqual(with_shards.lifecycle, 10)
        self.assertEqual(with_shards.shard.shard_num, 10)

        self.odps.delete_table(name, if_exists=True)
        assert_absent()
Ejemplo n.º 25
0
    def testReadWriteTable(self):
        """Write four records and verify ``read_table`` and ``head`` return them.

        ``read_table`` is checked for the full row set and with ``step=2``;
        the table is deleted at the end and must no longer exist.
        """
        table_name = tn('pyodps_t_tmp_read_write_table')
        schema = Schema.from_lists(['id', 'name', 'right'], ['bigint', 'string', 'boolean'])

        self.odps.delete_table(table_name, if_exists=True)
        self.assertFalse(self.odps.exist_table(table_name))

        table = self.odps.create_table(table_name, schema)
        rows = [
            [111, 'aaa', True],
            [222, 'bbb', False],
            [333, 'ccc', True],
            [444, '中文', False],
        ]
        row_count = len(rows)
        records = [Record(schema=schema, values=row) for row in rows]

        # Expected values have the name column passed through to_str.
        expected = [[row[0], to_str(row[1]), row[2]] for row in rows]

        def values_of(found_records):
            return [record.values for record in found_records]

        self.odps.write_table(table, 0, records)
        self.assertSequenceEqual(
            expected, values_of(self.odps.read_table(table, row_count)))
        self.assertSequenceEqual(
            expected[::2], values_of(self.odps.read_table(table, row_count, step=2)))

        self.assertSequenceEqual(expected, values_of(table.head(row_count)))

        self.odps.delete_table(table_name)
        self.assertFalse(self.odps.exist_table(table_name))
Ejemplo n.º 26
0
    def testCreateDeleteTable(self):
        """Create and delete a partitioned table, also checking the ``owner`` attribute.

        Right after creation ``table._getattr("owner")`` must be None while
        ``table.owner`` must not be; the order of these two checks matters.
        """
        name = tn("pyodps_t_tmp_create_table")
        schema = Schema.from_lists(["id", "name"], ["bigint", "string"], ["ds"], ["string"])
        project_tables = self.odps._project.tables

        def assert_absent():
            # The table must not exist at this point.
            self.assertFalse(self.odps.exist_table(name))

        project_tables.delete(name, if_exists=True)
        assert_absent()

        with_lifecycle = project_tables.create(name, schema, lifecycle=10)

        # Raw attribute read first, then the regular attribute access.
        self.assertIsNone(with_lifecycle._getattr("owner"))
        self.assertIsNotNone(with_lifecycle.owner)

        self.assertEqual(with_lifecycle.name, name)
        self.assertEqual(with_lifecycle.schema, schema)
        self.assertEqual(with_lifecycle.lifecycle, 10)

        project_tables.delete(name, if_exists=True)
        assert_absent()

        with_shards = self.odps.create_table(name, schema, shard_num=10, hub_lifecycle=5)
        self.assertEqual(with_shards.name, name)
        self.assertEqual(with_shards.schema, schema)
        self.assertNotEqual(with_shards.lifecycle, 10)
        self.assertEqual(with_shards.shard.shard_num, 10)

        self.odps.delete_table(name, if_exists=True)
        assert_absent()
Ejemplo n.º 27
0
    def _create_table(self, table_name):
        """Drop ``table_name`` if it exists and recreate it with eight typed columns.

        Returns whatever ``self.odps.create_table`` returns for the new table.
        """
        columns = (
            ('id', 'string'),
            ('int_num', 'bigint'),
            ('float_num', 'double'),
            ('dt', 'datetime'),
            ('bool', 'boolean'),
            ('dec', 'decimal'),
            ('arr', 'array<string>'),
            ('m', 'map<string,bigint>'),
        )

        self.odps.delete_table(table_name, if_exists=True)
        schema = Schema.from_lists(
            [column_name for column_name, _ in columns],
            [type_name for _, type_name in columns],
        )
        return self.odps.create_table(table_name, schema=schema)
    def testUnion(self):
        """Union an ODPS frame with a pandas frame, then persist the union.

        The mixed-engine result is compared with a pure-pandas computation.
        The union is then persisted into a temporary table partitioned by
        ``name`` (expected row count 5) and a filtered query is run on it with
        ``force_odps`` disabled. The temporary table is dropped in ``finally``.
        """
        mixed_expr = self.odps_df.union(self.pd_df).sort(['id', 'name'])
        result = self.engine.execute(mixed_expr).values

        local_df = DataFrame(self.odps_df.to_pandas())
        pandas_expr = local_df.union(self.pd_df).sort(['id', 'name'])
        expected = self.pd_engine.execute(pandas_expr).values
        self.assertTrue(result.equals(expected))

        # Every source column except ``name`` stays a regular column;
        # ``name`` becomes the partition column.
        partitioned_schema = Schema.from_lists(
            [c.name for c in self.t.schema.columns if c.name != 'name'],
            [c.type for c in self.t.schema.columns if c.name != 'name'],
            ['name'], ['string'])
        temp_name = 'tmp_pyodps_%s' % str(uuid.uuid4()).replace('-', '_')
        temp_table = self.odps.create_table(temp_name, partitioned_schema)
        try:
            union_expr = self.odps_df.union(self.pd_df)
            union_expr.persist(temp_table.name, create_table=False, partitions=['name'])

            row_count = self.engine.execute(DataFrame(temp_table).count())
            self.assertEqual(row_count, 5)

            self.engine._selecter.force_odps = False
            persisted_df = DataFrame(temp_table)
            filtered = persisted_df.filter(persisted_df.name > 'a',
                                           persisted_df.name < 'b')
            self.assertGreaterEqual(len(self.engine.execute(filtered)), 0)
        finally:
            temp_table.drop()
    def setup(self):
        """Prepare per-process ODPS and pandas DataFrames plus two engines.

        The table name embeds ``os.getpid()``. An existing table of that name
        is reused as is; otherwise it is created with ``lifecycle=1`` and
        filled with three rows.
        """
        import pandas as pd

        column_names = ['name', 'id']
        column_types = ['string', 'bigint']

        odps_rows = [
            ['name1', 1],
            ['name2', 2],
            ['name1', 3],
        ]
        pandas_rows = [['name1', 5], ['name2', 6]]

        table_name = tn('pyodps_df_mixed_%d' % os.getpid())
        if not self.odps.exist_table(table_name):
            self.t = self.odps.create_table(
                table_name,
                Schema.from_lists(column_names, column_types),
                lifecycle=1,
            )
            with self.t.open_writer() as writer:
                writer.write([self.t.new_record(row) for row in odps_rows])
        else:
            self.t = self.odps.get_table(table_name)

        self.odps_df = DataFrame(self.t)
        pandas_frame = pd.DataFrame(pandas_rows, columns=column_names)
        self.pd_df = DataFrame(pandas_frame)

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)
Ejemplo n.º 30
0
 def setUp(self):
     """Run the base set-up, start a cProfile profiler, and build ``self.SCHEMA``.

     Each column of the schema is named after its own type.
     """
     TestBase.setUp(self)

     profiler = cProfile.Profile()
     self.pr = profiler
     self.pr.enable()

     type_names = ['bigint', 'double', 'datetime', 'boolean', 'string', 'decimal']
     # Column names are the same strings as the type names.
     column_names = list(type_names)
     self.SCHEMA = Schema.from_lists(column_names, type_names)
Ejemplo n.º 31
0
    def _create_table(self, table_name):
        """Recreate ``table_name`` with a fixed eight-column schema and return it.

        Any existing table of that name is deleted first.
        """
        column_names = ['id', 'int_num', 'float_num', 'dt', 'bool', 'dec', 'arr', 'm']
        column_types = [
            'string',
            'bigint',
            'double',
            'datetime',
            'boolean',
            'decimal',
            'array<string>',
            'map<string,bigint>',
        ]

        self.odps.delete_table(table_name, if_exists=True)
        new_schema = Schema.from_lists(column_names, column_types)
        return self.odps.create_table(table_name, schema=new_schema)
    def setup(self):
        """Recreate the mixed-engine table and build both DataFrames and engines.

        The table named by ``tn('pyodps_df_mixed')`` is dropped if present,
        recreated with columns ``name``/``id`` and filled with three rows.
        """
        import pandas as pd

        column_names = ['name', 'id']
        column_types = ['string', 'bigint']

        odps_rows = [
            ['name1', 1],
            ['name2', 2],
            ['name1', 3],
        ]
        pandas_rows = [
            ['name1', 5],
            ['name2', 6],
        ]

        table_name = tn('pyodps_df_mixed')
        self.odps.delete_table(table_name, if_exists=True)
        self.t = self.odps.create_table(
            table_name, Schema.from_lists(column_names, column_types))
        with self.t.open_writer() as writer:
            writer.write([self.t.new_record(row) for row in odps_rows])

        self.odps_df = DataFrame(self.t)
        pandas_frame = pd.DataFrame(pandas_rows, columns=column_names)
        self.pd_df = DataFrame(pandas_frame)

        self.engine = MixedEngine(self.odps)
        self.pd_engine = PandasEngine(self.odps)
Ejemplo n.º 33
0
    def testBloomFilter(self):
        """Bloom-filter ``self.expr`` on ``name`` against a one-row pandas collection.

        The filter source is the ``name`` column of the first row of the
        pandas-backed expression; no result row may have ``'name2'`` as its
        first value.
        """
        main_rows = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        filter_rows = [
            ['name1'],
            ['name3'],
        ]

        self._gen_data(data=main_rows)

        filter_schema = Schema.from_lists(['name', ], [types.string])

        import pandas as pd
        filter_frame = pd.DataFrame(filter_rows, columns=filter_schema.names)
        filter_expr = CollectionExpr(_source_data=filter_frame, _schema=filter_schema)

        bloom_expr = self.expr.bloom_filter('name', filter_expr[:1].name, capacity=10)

        result = self._get_result(self.engine.execute(bloom_expr))

        unexpected = [row for row in result if row[0] == 'name2']
        self.assertTrue(not unexpected)
Ejemplo n.º 34
0
    def _register_reader(self):
        """Register a table reader for this split and build the table schema.

        Sends a ``RegisterTableReaderRequest`` carrying ``self._handle`` and
        ``self.split_proto`` and parses the JSON column descriptions from the
        response.

        Returns:
            tuple: ``(resp.readIterator, schema)`` where ``schema`` is built
            with ``Schema.from_lists`` from the response's ``schema`` and,
            when present, ``partitionSchema`` fields.

        Raises:
            CupidError: if the RPC controller reports a failure.
        """
        controller = CupidRpcController()
        channel = SandboxRpcChannel()
        stub = subprocess_pb.CupidSubProcessService_Stub(channel)

        req = subprocess_pb.RegisterTableReaderRequest(inputTableHandle=self._handle,
                                                       inputSplit=self.split_proto)
        resp = stub.RegisterTableReader(controller, req, None)
        if controller.Failed():
            raise CupidError(controller.ErrorText())

        logger.info("RegisterTableReader response: %s", resp)
        logger.info("RegisterTableReaderResponse protobuf field size = %d", len(resp.ListFields()))

        schema_json = json.loads(resp.schema)
        partition_schema_json = json.loads(resp.partitionSchema) \
            if resp.HasField('partitionSchema') else None

        schema_names = [d['name'] for d in schema_json]
        schema_types = [d['type'] for d in schema_json]

        if partition_schema_json is None:
            # No partition schema in the response: iterating None would raise
            # TypeError, so build a schema from the regular columns only.
            schema = Schema.from_lists(schema_names, schema_types)
        else:
            pt_schema_names = [d['name'] for d in partition_schema_json]
            pt_schema_types = [d['type'] for d in partition_schema_json]
            schema = Schema.from_lists(schema_names, schema_types,
                                       pt_schema_names, pt_schema_types)

        return resp.readIterator, schema
Ejemplo n.º 35
0
    def testReadMapArraySQLInstance(self):
        """Write rows with map and array columns and read them back through SQL.

        The reader is opened with the table's schema; rows are compared after
        sorting both sides by the first column. The table is dropped at the end.
        """
        table_name = tn('pyodps_t_tmp_read_map_array_sql_instance')
        self.odps.delete_table(table_name, if_exists=True)

        column_names = ['idx', 'map_col', 'array_col']
        column_types = [
            'bigint',
            odps_types.Map(odps_types.string, odps_types.string),
            odps_types.Array(odps_types.string),
        ]
        table = self.odps.create_table(
            table_name, schema=Schema.from_lists(column_names, column_types))

        rows = [
            [0, {'key1': 'value1', 'key2': 'value2'}, ['item1', 'item2', 'item3']],
            [1, {'key3': 'value3', 'key4': 'value4'}, ['item4', 'item5']],
        ]
        self.odps.write_table(table_name, rows)

        def by_index(row):
            return row[0]

        instance = self.odps.execute_sql('select * from %s' % table_name)
        with instance.open_reader(table.schema) as reader:
            read_rows = sorted([list(record.values) for record in reader], key=by_index)
            expected_rows = sorted(rows, key=by_index)

            self.assertSequenceEqual(read_rows, expected_rows)

        table.drop()
 def testRecordSetAndGetByName(self):
     """Values assigned to a record by column name can be read back by name and via ``values``."""
     s = Schema.from_lists(['col%s' % i for i in range(8)], [
         'bigint', 'double', 'string', 'datetime', 'boolean', 'decimal',
         'array<string>', 'map<string,bigint>'
     ])
     r = Record(schema=s)
     r['col0'] = 1
     r['col1'] = 1.2
     r['col2'] = 'abc'
     r['col3'] = datetime(2016, 1, 1)
     r['col4'] = True
     r['col5'] = _decimal.Decimal('1.111')
     r['col6'] = ['a', 'b']
     r['col7'] = OrderedDict({'a': 1})
     self.assertSequenceEqual(r.values, [
         1, 1.2, 'abc',
         datetime(2016, 1, 1), True,
         _decimal.Decimal('1.111'), ['a', 'b'],
         OrderedDict({'a': 1})
     ])
     # assertEqual replaces the deprecated assertEquals alias, which newer
     # Python versions no longer provide.
     self.assertEqual(1, r['col0'])
     self.assertEqual(1.2, r['col1'])
     self.assertEqual('abc', r['col2'])
     self.assertEqual(datetime(2016, 1, 1), r['col3'])
     self.assertEqual(True, r['col4'])
     self.assertEqual(_decimal.Decimal('1.111'), r['col5'])
     self.assertEqual(['a', 'b'], r['col6'])
     self.assertEqual(OrderedDict({'a': 1}), r['col7'])
 def testNullableRecord(self):
     """A record created from eight ``None`` values reports ``None`` for every column."""
     column_names = ['col%s' % idx for idx in range(8)]
     column_types = [
         'bigint', 'double', 'string', 'datetime', 'boolean', 'decimal',
         'array<string>', 'map<string,bigint>'
     ]
     record = Record(schema=Schema.from_lists(column_names, column_types),
                     values=[None] * 8)
     expected = [None] * 8
     self.assertSequenceEqual(record.values, expected)
Ejemplo n.º 38
0
    def testPersistExecute(self):
        """Two delayed persists into different partitions both run on ``delay.execute()``."""
        delay = Delay()
        positive_ids = self.df[self.df.id > 0].cache()

        target_name = tn('pyodps_test_delay_persist')
        target_schema = Schema.from_lists(
            ['id', 'name', 'value'],
            ['bigint', 'string', 'bigint'],
            ['pt', 'ds'],
            ['string', 'string'],
        )
        self.odps.delete_table(target_name, if_exists=True)
        self.odps.create_table(target_name, target_schema)

        high_future = positive_ids[positive_ids.value > 2].persist(
            target_name, partition='pt=a,ds=d1', delay=delay)
        low_future = positive_ids[positive_ids.value < 2].persist(
            target_name, partition='pt=a,ds=d2', delay=delay)

        delay.execute()
        high_df = high_future.result()
        low_df = low_future.result()

        # Each persisted frame is checked the same way: its predicate is over
        # the two partition columns, and its rows (minus those two trailing
        # columns) equal the matching source rows.
        checks = (
            (high_df, lambda d: d[2] > 2),
            (low_df, lambda d: d[2] < 2),
        )
        for persisted, keeps in checks:
            self.assertEqual([c.lhs.name for c in persisted.predicate.children()],
                             ['pt', 'ds'])
            rows = self._get_result(persisted.execute())
            self.assertEqual([r[:-2] for r in rows],
                             [d for d in self.data if keeps(d)])
Ejemplo n.º 39
0
 def testNullableRecord(self):
     """An all-``None`` record keeps ``None`` in each of its eight columns."""
     names = ['col%s'%n for n in range(8)]
     type_names = ['bigint', 'double', 'string', 'datetime', 'boolean', 'decimal',
                   'array<string>', 'map<string,bigint>']
     schema = Schema.from_lists(names, type_names)

     empty_record = Record(schema=schema, values=[None]*8)

     self.assertSequenceEqual(empty_record.values, [None]*8)
Ejemplo n.º 40
0
    def testJoinGroupby(self):
        """Grouping after a join on ``name`` gives the same ``fid`` sums as pandas on the left data."""
        import pandas as pd

        left_rows = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]
        right_schema = Schema.from_lists(['name', 'id2', 'id3'],
                                         [types.string, types.int64, types.int64])

        self._gen_data(data=left_rows)

        right_rows = [
            ['name1', 4, -1],
            ['name2', 1, -2]
        ]
        right_frame = pd.DataFrame(right_rows, columns=right_schema.names)
        right_expr = CollectionExpr(_source_data=right_frame, _schema=right_schema)

        # Join, keep only the left-hand columns, then sum fid per id.
        joined = self.expr.join(right_expr, on='name')[self.expr]
        summed = joined.groupby('id').agg(joined.fid.sum())

        result = self._get_result(self.engine.execute(summed))

        left_frame = pd.DataFrame(left_rows, columns=self.expr.schema.names)
        expected = left_frame.groupby('id').agg({'fid': 'sum'})
        self.assertEqual(expected.reset_index().values.tolist(), result)
Ejemplo n.º 41
0
    def testListInstancesInPage(self):
        """A running SQL instance is listed as running and exposes task and worker details.

        Registers a Python UDF that sleeps inside ``evaluate`` so that the SQL
        instance is still running while it is inspected, then checks the
        instance listing, task warnings and the first worker's stdout log.
        The instance, resource, function and table are cleaned up afterwards.
        """
        test_table = tn('pyodps_t_tmp_list_instances_in_page')

        delay_udf = textwrap.dedent("""
        from odps.udf import annotate
        import sys
        import time

        @annotate("bigint->bigint")
        class Delayer(object):
           def evaluate(self, arg0):
               print('Start Logging')
               sys.stdout.flush()
               time.sleep(45)
               print('End Logging')
               sys.stdout.flush()
               return arg0
        """)
        resource_name = tn('test_delayer_function_resource')
        function_name = tn('test_delayer_function')

        if self.odps.exist_resource(resource_name + '.py'):
            self.odps.delete_resource(resource_name + '.py')
        res = self.odps.create_resource(resource_name + '.py', 'py', file_obj=delay_udf)

        if self.odps.exist_function(function_name):
            self.odps.delete_function(function_name)
        fun = self.odps.create_function(function_name, class_type=resource_name + '.Delayer', resources=[res, ])

        data = [[random.randint(0, 1000)] for _ in compat.irange(100)]
        self.odps.delete_table(test_table, if_exists=True)
        t = self.odps.create_table(test_table, Schema.from_lists(['num'], ['bigint']))
        self.odps.write_table(t, data)

        instance = self.odps.run_sql("select sum({0}(num)), 1 + '1' as warn_col from {1} group by num"
                                     .format(function_name, test_table))

        try:
            self.assertEqual(instance.status, Instance.Status.RUNNING)
            self.assertIn(instance.id, [it.id for it in self.odps.get_project().instances.iterate(
                status=Instance.Status.RUNNING,
                from_time=datetime.now()-timedelta(days=2),
                end_time=datetime.now()+timedelta(days=1), max_items=20)])

            self.waitContainerFilled(lambda: instance.tasks)
            task = instance.tasks[0]
            task.put_info('testInfo', 'TestInfo')
            self.assertIsNotNone(task.warnings)

            self.waitContainerFilled(lambda: task.workers, 30)
            self.assertIsNotNone(task.workers[0].get_log('stdout'))
        finally:
            # Stopping is best-effort, but only ordinary errors are ignored so
            # that KeyboardInterrupt / SystemExit still propagate.
            try:
                instance.stop()
            except Exception:
                pass
            res.drop()
            fun.drop()
            t.drop()
    def setup(self):
        """Create three mock-table-backed collection expressions for the tests."""
        # Plain three-column table.
        plain_schema = Schema.from_lists(
            ['name', 'id', 'fid'],
            [types.string, types.int64, types.float64],
        )
        plain_table = MockTable(name='pyodps_test_expr_table', schema=plain_schema)
        plain_table._client = self.config.odps.rest
        self.expr = CollectionExpr(_source_data=plain_table, _schema=plain_schema)

        # Same columns plus two partition columns.
        partitioned_schema = Schema.from_lists(
            ['name', 'id', 'fid'],
            [types.string, types.int64, types.float64],
            ['part1', 'part2'],
            [types.string, types.int64],
        )
        partitioned_table = MockTable(name='pyodps_test_expr_table2', schema=partitioned_schema)
        partitioned_table._client = self.config.odps.rest
        self.expr2 = CollectionExpr(_source_data=partitioned_table, _schema=partitioned_schema)

        # Table with dict and list typed columns; no REST client is attached to it.
        composite_schema = Schema.from_lists(
            ['id', 'name', 'relatives', 'hobbies'],
            [
                types.int64,
                types.string,
                types.Dict(types.string, types.string),
                types.List(types.string),
            ],
        )
        composite_table = MockTable(name='pyodps_test_expr_table3', schema=composite_schema)
        self.expr3 = CollectionExpr(_source_data=composite_table, _schema=composite_schema)
Ejemplo n.º 43
0
    def testGetAttrs(self):
        """``get_attrs`` returns the expected attribute names for ``expr.id + 1``."""
        columns = Schema.from_lists(['name', 'id'], [types.string, types.int64])
        source = MockTable(name='pyodps_test_expr_table', schema=columns)
        collection = CollectionExpr(_source_data=source, _schema=columns)

        self.assertSequenceEqual(
            ('_lhs', '_rhs', '_data_type', '_source_data_type', '_name',
             '_source_name', '_engine', '_cached_args'),
            get_attrs(collection.id + 1),
        )
Ejemplo n.º 44
0
    def testRecordMultiFields(self):
        """Indexing a record with several names returns their values; unknown names raise."""
        two_columns = Schema.from_lists(['col1', 'col2'], ['string', 'bigint'])
        record = Record(values=[1, 2], schema=two_columns)

        self.assertEqual(record['col1', 'col2'], ['1', 2])

        # Both a single unknown name and a one-element tuple of it must fail.
        for missing_key in ('col3', ('col3', )):
            with self.assertRaises(AttributeError):
                record[missing_key]
Ejemplo n.º 45
0
    def testGetAttrs(self):
        """Check the attribute names ``get_attrs`` reports for the expression ``expr.id + 1``."""
        table_schema = Schema.from_lists(['name', 'id'], [types.string, types.int64])
        mock_table = MockTable(name='pyodps_test_expr_table', schema=table_schema)
        source_expr = CollectionExpr(_source_data=mock_table, _schema=table_schema)

        expected_attrs = (
            '_lhs', '_rhs', '_data_type', '_source_data_type', '_name',
            '_source_name', '_engine', '_cache_data', '_need_cache', '_cached_args',
        )
        actual_attrs = get_attrs(source_expr.id + 1)
        self.assertSequenceEqual(expected_attrs, actual_attrs)
Ejemplo n.º 46
0
    def setup(self):
        """Create two dynamic collection expressions backed by mock tables."""
        def datatypes(*type_names):
            # Validate each type name into a data type object.
            return [validate_data_type(name) for name in type_names]

        first_static = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            datatypes('string', 'int64', 'float64', 'boolean', 'decimal',
                      'datetime'))
        first_schema = DynamicSchema.from_schema(first_static)
        first_table = MockTable(name='pyodps_test_expr_table', schema=first_schema)

        second_static = Schema.from_lists(
            ['name2', 'id', 'fid2'], datatypes('string', 'int64', 'float64'))
        # The second dynamic schema is created with string as its default type.
        second_schema = DynamicSchema.from_schema(second_static,
                                                  default_type=types.string)
        second_table = MockTable(name='pyodps_test_expr_tabl2', schema=second_schema)

        self.expr = DynamicCollectionExpr(_source_data=first_table,
                                          _schema=first_schema)
        self.expr2 = DynamicCollectionExpr(_source_data=second_table,
                                           _schema=second_schema)
Ejemplo n.º 47
0
    def testCreateTableWithChineseColumn(self):
        """A table created with Chinese column names reports the same column names back."""
        name = tn("pyodps_t_tmp_create_table_with_chinese_columns")
        requested = Schema.from_lists(["序列", "值"], ["bigint", "string"], ["ds"], ["string"])

        self.odps.delete_table(name, if_exists=True)
        created = self.odps.create_table(name, requested)

        created_names = [col.name for col in created.schema.columns]
        requested_names = [col.name for col in requested.columns]
        self.assertSequenceEqual(created_names, requested_names)
    def setup(self):
        """Create a three-column mock collection expression and a fresh execute context."""
        type_names = ('string', 'int64', 'float64')
        column_types = [validate_data_type(type_name) for type_name in type_names]
        mock_schema = Schema.from_lists(['name', 'id', 'fid'], column_types)
        mock_table = MockTable(name='pyodps_test_expr_table', schema=mock_schema)

        self.expr = CollectionExpr(_source_data=mock_table, _schema=mock_schema)
        self.ctx = ExecuteContext()
Ejemplo n.º 49
0
    def testRecordMultiFields(self):
        """A record indexed by two names returns both values; a missing name raises AttributeError."""
        record = Record(
            values=[1, 2],
            schema=Schema.from_lists(['col1', 'col2'], ['string', 'bigint']),
        )

        both_values = record['col1', 'col2']
        self.assertEqual(both_values, ['1', 2])

        with self.assertRaises(AttributeError):
            record['col3']
        with self.assertRaises(AttributeError):
            record['col3', ]
Ejemplo n.º 50
0
    def testSetitemField(self):
        """Assigning new fields with ``expr[name] = ...`` yields the expected expression types.

        Covers assignment on a copied collection, on a group-by aggregation,
        on a merged left join and on a self-projection.
        """
        from odps.df.expr.groupby import GroupByCollectionExpr
        from odps.df.expr.merge import JoinFieldMergedCollectionExpr

        expr = self.expr.copy()

        expr['new_id'] = expr.id + 1

        self.assertIn('new_id', expr.schema.names)
        self.assertIs(expr._fields[-1].lhs.input, expr.input)

        self.assertEqual(expr.schema.names, ['name', 'id', 'fid', 'new_id'])

        expr['new_id2'] = expr.id + 2

        self.assertIn('new_id2', expr.schema.names)
        self.assertIs(expr._fields[-1].lhs.input, expr.input)

        self.assertEqual(expr.schema.names,
                         ['name', 'id', 'fid', 'new_id', 'new_id2'])
        self.assertIsNone(expr._input._proxy)

        expr['new_id2'] = expr.new_id

        expr['new_id3'] = expr.id + expr.new_id2
        self.assertIs(expr._fields[-1].lhs.input, expr.input)
        self.assertIs(expr._fields[-1].rhs.lhs.input, expr.input)

        self.assertIsInstance(expr, ProjectCollectionExpr)
        # assertTrue replaces the deprecated assert_ alias, which newer
        # Python versions no longer provide.
        self.assertTrue(isinstance(expr, ProjectCollectionExpr))

        expr2 = expr.groupby('name').agg(expr.id.sum())
        expr2['new_id2'] = expr2.id_sum + 1
        self.assertIsInstance(expr2, ProjectCollectionExpr)
        self.assertNotIsInstance(expr2, GroupByCollectionExpr)
        self.assertNotIsInstance(expr2, FilterCollectionExpr)

        schema = Schema.from_lists(
            ['name', 'id', 'fid2', 'fid3'],
            [types.string, types.int64, types.float64, types.float64])
        table = MockTable(name='pyodps_test_expr_table', schema=schema)
        table._client = self.config.odps.rest
        expr3 = CollectionExpr(_source_data=table, _schema=schema)

        expr4 = expr.left_join(
            expr3,
            on=[expr.name == expr3.name, expr.id == expr3.id],
            merge_columns=True)
        expr4['fid_1'] = expr4.groupby('id').sort('fid2').row_number()
        self.assertIsInstance(expr4, JoinFieldMergedCollectionExpr)
        self.assertIsNone(expr4._proxy)

        expr5 = expr[expr]
        expr5['name_2'] = expr5.apply(lambda row: row.name,
                                      axis=1,
                                      reduce=True)
        self.assertIsInstance(expr5, ProjectCollectionExpr)
        self.assertIsNone(expr5._proxy)
Ejemplo n.º 51
0
    def uploadCSV(self, csvFilename, tableName, sep=",", pt=None):
        """
        Upload a local CSV file into an ODPS table, declaring every column as a string.

        :param csvFilename: path of the local CSV file; it must have a header row
        :param tableName: name of the table to upload to in ODPS
        :param sep: field separator of the CSV file
        :param pt: value for the ``pt`` partition; when given, the table is created
            with a ``pt`` partition column and rows are written into ``pt=<value>``,
            otherwise the table is created without partitions
        """
        print("start upload ...\n")
        df = pd.read_csv(csvFilename, sep=sep)
        shape0 = df.shape[0]
        columns = [
            Column(name=f"{x}", type='string', comment='the column')
            for x in df.columns
        ]

        if pt:
            partitions = [
                Partition(name='pt', type='string', comment='the partition')
            ]
            schema = Schema(columns=columns, partitions=partitions)
            table = self.creat_table(tableName, schema)
            table.create_partition(f"pt={pt}", if_not_exists=True)
            table_columns = [i.name for i in table.schema.columns]
            with table.open_writer(partition=f"pt={pt}") as writer:
                for index in df.index:
                    print(f"{index+1}/{shape0} in {tableName}  ...")
                    item_dict = dict(df.loc[index])
                    # The last schema column is skipped and the partition value
                    # is appended in its place (presumably the trailing column
                    # is the ``pt`` partition column -- confirm).
                    item = [item_dict.get(field, '') for field in table_columns[:-1]]
                    item.append(pt)
                    writer.write(item)
        else:
            schema = Schema(columns=columns)
            table = self.creat_table(tableName, schema)
            table_columns = [i.name for i in table.schema.columns]
            with table.open_writer(partition=None) as writer:
                for index in df.index:
                    print(f"{index+1}/{shape0} in {tableName}  ...")
                    item_dict = dict(df.loc[index])
                    # Without a partition every schema column is a data column,
                    # so none may be dropped from the written row.
                    item = [item_dict.get(field, '') for field in table_columns]
                    writer.write(item)
        print("\n\n upload finish ...")
Ejemplo n.º 52
0
    def setup(self):
        """Create the five mock collection expressions ``expr`` .. ``expr4`` used by the tests."""
        def datatypes(*type_names):
            # Validate each type name into a data type object.
            return [validate_data_type(name) for name in type_names]

        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            datatypes('string', 'int64', 'float64', 'boolean', 'decimal',
                      'datetime'),
            ['ds'],
            datatypes('string'),
        )

        # Three expressions over differently named tables share the first
        # schema; each gets its own Schema object built from schema.columns.
        same_schema_tables = (
            ('expr', 'pyodps_test_expr_table'),
            ('expr1', 'pyodps_test_expr_table1'),
            ('expr2', 'pyodps_test_expr_table2'),
        )
        for attr_name, table_name in same_schema_tables:
            mock_table = MockTable(name=table_name, schema=schema)
            setattr(self, attr_name,
                    CollectionExpr(_source_data=mock_table,
                                   _schema=Schema(columns=schema.columns)))

        schema2 = Schema.from_lists(
            ['name', 'id', 'fid'],
            datatypes('string', 'int64', 'float64'),
            ['part1', 'part2'],
            datatypes('string', 'int64'),
        )
        table3 = MockTable(name='pyodps_test_expr_table2', schema=schema2)
        self.expr3 = CollectionExpr(_source_data=table3,
                                    _schema=Schema(columns=schema2.columns))

        schema3 = Schema.from_lists(
            ['id', 'name', 'relatives', 'hobbies'],
            datatypes('int64', 'string', 'dict<string, string>', 'list<string>'),
        )
        # NOTE(review): this mock table is given the first schema while the
        # expression uses schema3 -- confirm that this pairing is intended.
        table4 = MockTable(name='pyodps_test_expr_table', schema=schema)
        self.expr4 = CollectionExpr(_source_data=table4, _schema=schema3)
Ejemplo n.º 53
0
    def testCreateTableWithChineseColumn(self):
        """Column names containing Chinese characters survive table creation unchanged."""
        table_name = tn('pyodps_t_tmp_create_table_with_chinese_columns')
        wanted_schema = Schema.from_lists(
            ['序列', '值'],
            ['bigint', 'string'],
            ['ds', ],
            ['string',],
        )

        self.odps.delete_table(table_name, if_exists=True)
        new_table = self.odps.create_table(table_name, wanted_schema)

        names_in_table = [column.name for column in new_table.schema.columns]
        names_wanted = [column.name for column in wanted_schema.columns]
        self.assertSequenceEqual(names_in_table, names_wanted)
    def setup(self):
        """Create the mock collection expressions ``expr``, ``expr1``, ``expr2`` and ``expr3``."""
        def datatypes(*type_names):
            # Validate each type name into a data type object.
            return [validate_data_type(type_name) for type_name in type_names]

        def columns_only(source_schema):
            # Build a new Schema from the given schema's columns.
            return Schema(columns=source_schema.columns)

        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'),
            ['ds'],
            datatypes('string'),
        )

        table = MockTable(name='pyodps_test_expr_table', schema=schema)
        self.expr = CollectionExpr(_source_data=table, _schema=columns_only(schema))

        table1 = MockTable(name='pyodps_test_expr_table1', schema=schema)
        self.expr1 = CollectionExpr(_source_data=table1, _schema=columns_only(schema))

        table2 = MockTable(name='pyodps_test_expr_table2', schema=schema)
        self.expr2 = CollectionExpr(_source_data=table2, _schema=columns_only(schema))

        schema2 = Schema.from_lists(
            ['name', 'id', 'fid'],
            datatypes('string', 'int64', 'float64'),
            ['part1', 'part2'],
            datatypes('string', 'int64'),
        )
        table3 = MockTable(name='pyodps_test_expr_table2', schema=schema2)
        self.expr3 = CollectionExpr(_source_data=table3, _schema=columns_only(schema2))
Ejemplo n.º 55
0
    def setup(self):
        """Create two dynamic collection expressions over mock tables."""
        def datatypes(*type_names):
            # Validate each type name into a data type object.
            return [validate_data_type(name) for name in type_names]

        wide_columns = Schema.from_lists(
            ["name", "id", "fid", "isMale", "scale", "birth"],
            datatypes("string", "int64", "float64", "boolean", "decimal", "datetime"),
        )
        wide_schema = DynamicSchema.from_schema(wide_columns)
        wide_table = MockTable(name="pyodps_test_expr_table", schema=wide_schema)

        narrow_columns = Schema.from_lists(
            ["name2", "id", "fid2"], datatypes("string", "int64", "float64")
        )
        # The second dynamic schema is created with string as its default type.
        narrow_schema = DynamicSchema.from_schema(narrow_columns, default_type=types.string)
        narrow_table = MockTable(name="pyodps_test_expr_tabl2", schema=narrow_schema)

        self.expr = DynamicCollectionExpr(_source_data=wide_table, _schema=wide_schema)
        self.expr2 = DynamicCollectionExpr(_source_data=narrow_table, _schema=narrow_schema)
Ejemplo n.º 56
0
    def setup(self):
        """Recreate the ``pyodps_test_dataframe`` table and write three rows into it."""
        name = tn('pyodps_test_dataframe')
        two_columns = Schema.from_lists(['id', 'name'], ['bigint', 'string'])

        # Start from a clean table on every run.
        self.odps.delete_table(name, if_exists=True)
        self.table = self.odps.create_table(name, two_columns)

        with self.table.open_writer() as writer:
            writer.write([
                [1, 'name1'],
                [2, 'name2'],
                [3, 'name3'],
            ])
Ejemplo n.º 57
0
    def setup(self):
        """Create a six-column mock collection expression as ``self.expr``."""
        column_names = ["name", "id", "fid", "isMale", "scale", "birth"]
        column_types = [
            validate_data_type(type_name)
            for type_name in ("string", "int64", "float64", "boolean", "decimal", "datetime")
        ]
        mock_schema = Schema.from_lists(column_names, column_types)
        mock_table = MockTable(name="pyodps_test_expr_table", schema=mock_schema)

        self.expr = CollectionExpr(_source_data=mock_table, _schema=mock_schema)
    def testPivot(self):
        """Pivot a small table on ``id``/``name`` and check several derived expressions.

        Covers distinct, multi-value pivots, selecting a missing pivot column
        (expects ``ValueError``), column selection, computed columns and a join.
        The temporary table is dropped afterwards.
        """
        rows = [["name1", 1, 1.0, True], ["name1", 2, 2.0, True], ["name2", 1, 3.0, False], ["name2", 3, 4.0, False]]

        pivot_table_name = tn("pyodps_test_mixed_engine_pivot")
        self.odps.delete_table(pivot_table_name, if_exists=True)
        table = self.odps.create_table(
            name=pivot_table_name,
            schema=Schema.from_lists(["name", "id", "fid", "ismale"], ["string", "bigint", "double", "boolean"]),
        )
        expr = DataFrame(table)

        def run(expression):
            # Execute on the engine under test and convert to plain rows.
            return self._get_result(self.engine.execute(expression))

        def pivot_on_fid():
            # A fresh pivot of fid by name, keyed by id.
            return expr.pivot(rows="id", columns="name", values="fid")

        try:
            self.odps.write_table(table, 0, rows)

            # Pivot followed by distinct.
            result = run(pivot_on_fid().distinct())
            expected = [[1, 1.0, 3.0], [2, 2.0, None], [3, None, 4.0]]
            self.assertEqual(sorted(result), sorted(expected))

            # Pivot with two value columns.
            result = run(expr.pivot(rows="id", columns="name", values=["fid", "ismale"]))
            expected = [[1, 1.0, 3.0, True, False], [2, 2.0, None, True, None], [3, None, 4.0, None, False]]
            self.assertEqual(sorted(result), sorted(expected))

            # Selecting a pivot column that does not exist in the data fails at execution.
            missing_column = pivot_on_fid()["name3"]
            with self.assertRaises(ValueError) as cm:
                self.engine.execute(missing_column)
            self.assertIn("name3", str(cm.exception))

            # Selecting a subset of the pivoted columns.
            result = run(pivot_on_fid()["id", "name1"])
            expected = [[1, 1.0], [2, 2.0], [3, None]]
            self.assertEqual(sorted(result), sorted(expected))

            # Adding a computed column on top of the pivot.
            pivoted = pivot_on_fid()
            with_new_name = pivoted[pivoted, (pivoted["name1"].astype("int") + 1).rename("new_name")]
            result = run(with_new_name)
            expected = [[1, 1.0, 3.0, 2.0], [2, 2.0, None, 3.0], [3, None, 4.0, None]]
            self.assertEqual(sorted(result), sorted(expected))

            # Joining the pivot with another frame on id.
            pivoted = pivot_on_fid()
            joined = pivoted.join(self.odps_df, on="id")[pivoted, "name"]
            result = run(joined)
            expected = [[1, 1.0, 3.0, "name1"], [2, 2.0, None, "name2"], [3, None, 4.0, "name1"]]
            self.assertEqual(sorted(result), sorted(expected))
        finally:
            table.drop()
Ejemplo n.º 59
0
    def setup(self):
        """Create a three-column mock collection expression and an ODPS engine for the tests."""
        # Helper that validates each given type name into a data type object.
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(['name', 'id', 'fid'],
                                    datatypes('string', 'int64', 'float64'))

        table = MockTable(name='pyodps_test_expr_table', schema=schema)

        self.expr = CollectionExpr(_source_data=table, _schema=schema)

        self.engine = ODPSEngine(self.odps)