Ejemplo n.º 1
0
    def setup(self):
        """Create the test table in a SQL database and prepare engine fixtures.

        Builds the DataFrame schema, derives the corresponding ODPS schema,
        connects to a SQL backend through SQLAlchemy, creates the
        ``pyodps_test_data`` table and wraps it in a ``CollectionExpr``.
        """
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
                                   datatypes('string', 'int64', 'float64', 'boolean', 'decimal', 'datetime'))
        self.df_schema = schema
        self.schema = df_schema_to_odps_schema(schema)
        self.df = None
        self.expr = None

        self.engine = SQLAlchemyEngine()

        import sqlalchemy
        from sqlalchemy import create_engine

        # FIX: the 'postgres://' dialect alias was deprecated and removed in
        # SQLAlchemy 1.4; 'postgresql://' is the canonical name and works on
        # every SQLAlchemy version.
        self.sql_engine = engine = create_engine('postgresql://localhost/pyodps')
        # self.sql_engine = engine = create_engine('mysql://localhost/pyodps')
        # self.sql_engine = engine = create_engine('sqlite://')
        self.conn = engine.connect()

        self.metadata = metadata = sqlalchemy.MetaData(bind=engine)
        columns = df_schema_to_sqlalchemy_columns(self.df_schema, engine=self.sql_engine)
        t = sqlalchemy.Table('pyodps_test_data', metadata, *columns)

        metadata.create_all()

        self.table = t
        self.expr = CollectionExpr(_source_data=self.table, _schema=self.df_schema)

        class FakeBar(object):
            # No-op progress bar: silences progress reporting during tests.
            def update(self, *args, **kwargs):
                pass
        self.faked_bar = FakeBar()
 def _random_values(self):
     """Build one Record with random values; nullable fields are None ~5% of the time."""
     def maybe(gen):
         # Draw the probability first (same call order as a ternary), then
         # either skip with 5% chance or produce a value.
         if random.random() < 0.05:
             return None
         return gen()

     row = [maybe(self._gen_random_string),
            self._gen_random_bigint(),
            maybe(self._gen_random_double),
            maybe(self._gen_random_datetime)]
     return Record(schema=df_schema_to_odps_schema(self.schema), values=row)
    def setup(self):
        """Create a five-column ODPS test table and a Seahawks engine fixture."""
        to_types = lambda *names: [validate_data_type(n) for n in names]
        col_names = ['name', 'id', 'fid', 'isMale', 'birth', 'scale']
        col_types = to_types('string', 'int64', 'float64', 'boolean', 'datetime',
                             'decimal')
        # Only the first five columns are kept; the trailing decimal 'scale'
        # column is deliberately dropped.
        schema = Schema.from_lists(col_names[:5], col_types[:5])
        self.schema = df_schema_to_odps_schema(schema)
        table_name = tn('pyodps_test_%s' % str(uuid.uuid4()).replace('-', '_'))
        self.odps.delete_table(table_name, if_exists=True)
        self.table = self.odps.create_table(name=table_name,
                                            schema=self.schema)
        self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

        self.engine = SeahawksEngine(self.odps)

        class FakeBar(object):
            """Progress-bar stub that swallows all notifications."""
            def update(self, *args, **kwargs):
                pass

            def inc(self, *args, **kwargs):
                pass

            def status(self, *args, **kwargs):
                pass

        self.faked_bar = FakeBar()
 def _random_values(self):
     """Return a Record populated with one random (str, int64, float64) row."""
     generators = (self._gen_random_str,
                   self._gen_random_int64,
                   self._gen_random_float64)
     row = [gen() for gen in generators]
     return Record(schema=df_schema_to_odps_schema(self.schema),
                   values=row)
Ejemplo n.º 5
0
    def testExistingPersist(self):
        """Persist into a pre-existing table whose columns are in reverse order."""
        self.create_iris(IRIS_TABLE)
        df = DataFrame(self.odps.get_table(IRIS_TABLE)).append_id()

        # Rebuild the target schema with columns reversed so persisting must
        # match columns rather than relying on identical ordering.
        reversed_cols = list(reversed(df_schema_to_odps_schema(df.schema).columns))
        target_schema = Schema.from_lists([c.name for c in reversed_cols],
                                          [c.type for c in reversed_cols])

        self.odps.delete_table(EXISTING_PERSIST_TABLE, if_exists=True)
        self.odps.create_table(EXISTING_PERSIST_TABLE, target_schema)
        df.persist(EXISTING_PERSIST_TABLE)
Ejemplo n.º 6
0
    def setup(self):
        """Generate 20 rows of random data and wrap them in a pandas-backed DataFrame."""
        make_types = lambda *ts: [validate_data_type(t) for t in ts]
        schema = Schema.from_lists(
            ['name', 'category', 'id', 'fid', 'isMale', 'scale', 'birth'],
            make_types('string', 'string', 'int64', 'float64', 'boolean',
                       'decimal', 'datetime'))
        self.schema = df_schema_to_odps_schema(schema)

        import pandas as pd
        self.data = self._gen_data(20, value_range=(-1000, 1000))
        self.df = pd.DataFrame(self.data, columns=schema.names)
        self.expr = DataFrame(self.df, schema=schema)
    def testLargeColumnsFormatter(self):
        """Repr output must match between pandas-backed and plain ResultFrames
        when the schema is widened to 10x the base column count."""
        base_names = self.schema.names
        names = [name + str(i) for i in range(10) for name in base_names]
        wide_schema = Schema.from_lists(names, self.schema.types * 10)

        odps_schema = df_schema_to_odps_schema(wide_schema)

        def gen_row():
            # Concatenate ten random base rows into one wide row.
            row = []
            for _ in range(10):
                row.extend(self._random_values().values)
            return row

        data = [Record(schema=odps_schema, values=gen_row()) for _ in range(10)]

        pd_frame = ResultFrame(data=data, schema=wide_schema, pandas=True)
        plain_frame = ResultFrame(data=data, schema=wide_schema, pandas=False)

        self.assertEqual(to_str(repr(pd_frame)), to_str(repr(plain_frame)))
        self.assertEqual(to_str(pd_frame._repr_html_()),
                         to_str(plain_frame._repr_html_()))
    def testLargeColumnsFormatter(self):
        """Check that wide (10x-column) result frames render identically
        whether or not they are backed by pandas."""
        names = []
        for i in range(10):
            names.extend(name + str(i) for name in self.schema.names)
        types = self.schema.types * 10

        wide = Schema.from_lists(names, types)

        def make_record():
            # One wide row = ten random base rows chained end to end.
            values = list(itertools.chain.from_iterable(
                self._random_values().values for _ in range(10)))
            return Record(schema=df_schema_to_odps_schema(wide), values=values)

        records = [make_record() for _ in range(10)]

        with_pandas = ResultFrame(data=records, schema=wide, pandas=True)
        without_pandas = ResultFrame(data=records, schema=wide, pandas=False)

        self.assertEqual(to_str(repr(with_pandas)), to_str(repr(without_pandas)))
        self.assertEqual(to_str(with_pandas._repr_html_()),
                         to_str(without_pandas._repr_html_()))
Ejemplo n.º 9
0
    def testStaticPartition(self):
        """Persist into a statically specified partition and verify it exists."""
        self.create_iris(IRIS_TABLE)
        id_df = DataFrame(self.odps.get_table(IRIS_TABLE)).append_id()

        # Target table: same simple columns plus a string partition column 'ds'.
        src_schema = df_schema_to_odps_schema(id_df.schema)
        part_schema = Schema(
            columns=src_schema.simple_columns,
            partitions=[Partition(name='ds', type=odps_types.string)])
        self.odps.delete_table(STATIC_PART_TABLE, if_exists=True)
        dest_table = self.odps.create_table(STATIC_PART_TABLE, part_schema, lifecycle=1)

        id_df.persist(STATIC_PART_TABLE, partition='ds=20170314', lifecycle=1)
        self.assertTrue(dest_table.exist_partition('ds=20170314'))
Ejemplo n.º 10
0
    def setup(self):
        """Create two ODPS test tables, fill them with sample rows and build
        an EngineSelecter fixture for the selecter tests.
        """
        datatypes = lambda *types: [validate_data_type(t) for t in types]
        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            datatypes('string', 'int64', 'float64', 'boolean', 'decimal',
                      'datetime'))
        self.schema = df_schema_to_odps_schema(schema)
        # Unique table name per run so concurrent test runs do not collide.
        table_name = tn('pyodps_test_selecter_table_%s' %
                        str(uuid.uuid4()).replace('-', '_'))
        self.odps.delete_table(table_name, if_exists=True)
        self.table = self.odps.create_table(name=table_name,
                                            schema=self.schema)
        self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

        class FakeBar(object):
            # No-op progress bar: silences progress reporting during tests.
            def update(self, *args, **kwargs):
                pass

            def inc(self, *args, **kwargs):
                pass

            def status(self, *args, **kwargs):
                pass

        self.faked_bar = FakeBar()

        # Sample rows for the primary table; the trailing columns
        # (isMale/scale/birth) are intentionally left as None.
        data = [
            ['name1', 4, 5.3, None, None, None],
            ['name2', 2, 3.5, None, None, None],
            ['name1', 4, 4.2, None, None, None],
            ['name1', 3, 2.2, None, None, None],
            ['name1', 3, 4.1, None, None, None],
        ]

        schema2 = Schema.from_lists(['name', 'id2', 'id3'],
                                    [types.string, types.bigint, types.bigint])

        # Second, fixed-name table used as the counterpart collection.
        table_name = tn('pyodps_test_selecter_table2')
        self.odps.delete_table(table_name, if_exists=True)
        table2 = self.odps.create_table(name=table_name, schema=schema2)
        self.expr2 = CollectionExpr(_source_data=table2,
                                    _schema=odps_schema_to_df_schema(schema2))

        self._gen_data(data=data)

        data2 = [['name1', 4, -1], ['name2', 1, -2]]

        self.odps.write_table(table2, 0, data2)

        self.selecter = EngineSelecter()
Ejemplo n.º 11
0
    def setup(self):
        """Create the shared ODPS test table and an ODPS SQL engine fixture."""
        as_types = lambda *ts: [validate_data_type(t) for t in ts]
        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            as_types('string', 'int64', 'float64', 'boolean', 'decimal',
                     'datetime'))
        self.schema = df_schema_to_odps_schema(schema)
        table_name = 'pyodps_test_engine_table'
        self.odps.delete_table(table_name, if_exists=True)
        self.table = self.odps.create_table(name='pyodps_test_engine_table',
                                            schema=self.schema)
        self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

        self.engine = ODPSEngine(self.odps)

        class FakeBar(object):
            # Minimal progress-bar stub; every update is discarded.
            def update(self, *args, **kwargs):
                pass
        self.faked_bar = FakeBar()
Ejemplo n.º 12
0
    def setup(self):
        """Recreate 'pyodps_test_engine_table' and initialize the ODPS engine."""
        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            [validate_data_type(t)
             for t in ('string', 'int64', 'float64', 'boolean', 'decimal',
                       'datetime')])
        self.schema = df_schema_to_odps_schema(schema)
        table_name = 'pyodps_test_engine_table'
        self.odps.delete_table(table_name, if_exists=True)
        self.table = self.odps.create_table(
                name='pyodps_test_engine_table', schema=self.schema)
        self.expr = CollectionExpr(_source_data=self.table, _schema=schema)

        self.engine = ODPSEngine(self.odps)

        class FakeBar(object):
            """No-op progress bar used to silence progress reporting in tests."""
            def update(self, *args, **kwargs):
                pass
        self.faked_bar = FakeBar()
Ejemplo n.º 13
0
    def setup(self):
        """Build an empty pandas-backed expression plus pandas/ODPS engines."""
        import pandas as pd

        type_list = lambda *ts: [validate_data_type(t) for t in ts]
        schema = Schema.from_lists(
            ['name', 'id', 'fid', 'isMale', 'scale', 'birth'],
            type_list('string', 'int64', 'float64', 'boolean', 'decimal',
                      'datetime'))
        self.schema = df_schema_to_odps_schema(schema)

        # Empty frame with the right column names; rows are added per test.
        self.df = pd.DataFrame(None, columns=schema.names)
        self.expr = CollectionExpr(_source_data=self.df, _schema=schema)

        self.engine = PandasEngine(self.odps)
        self.odps_engine = ODPSEngine(self.odps)

        class FakeBar(object):
            # Progress-bar replacement that ignores all updates.
            def update(self, *args, **kwargs):
                pass
        self.faked_bar = FakeBar()
Ejemplo n.º 14
0
    def setup(self):
        """Prepare an empty in-memory collection and both execution engines."""
        names = ['name', 'id', 'fid', 'isMale', 'scale', 'birth']
        raw_types = ('string', 'int64', 'float64', 'boolean', 'decimal',
                     'datetime')
        schema = Schema.from_lists(names,
                                   [validate_data_type(t) for t in raw_types])
        self.schema = df_schema_to_odps_schema(schema)

        import pandas as pd
        self.df = pd.DataFrame(None, columns=schema.names)
        self.expr = CollectionExpr(_source_data=self.df, _schema=schema)

        self.engine = PandasEngine(self.odps)
        self.odps_engine = ODPSEngine(self.odps)

        class FakeBar(object):
            """Silent progress bar for tests."""
            def update(self, *args, **kwargs):
                pass

        self.faked_bar = FakeBar()
Ejemplo n.º 15
0
    def mock_action(self, sources, output_desc=1, msg='', action=None):
        """Wire *sources* into a MockNode and return its DataFrame/model outputs.

        Args:
            sources: a CollectionExpr, a model object, or an iterable of
                them; a single object is wrapped into a one-element list.
            output_desc: either an int (number of DATA output ports) or a
                string where each character describes one port
                ('d' -> DATA, any other char -> MODEL).
            msg: message string forwarded to the MockNode.
            action: callable forwarded to the MockNode.

        Returns:
            The single output when exactly one port is produced, otherwise
            the list of outputs.
        """
        try:
            from odps.ml import PmmlModel
        except ImportError:
            # odps.ml may be unavailable; in that case every output port
            # falls back to a DataFrame output below.
            PmmlModel = None

        if not isinstance(sources, Iterable):
            sources = [sources, ]

        # DataFrame-like sources become DATA ports, everything else MODEL ports.
        input_types = [PortType.DATA if isinstance(o, CollectionExpr) else PortType.MODEL for o in sources]

        source_objs = [adapter_from_df(s) if isinstance(s, CollectionExpr) else s for s in sources]
        uplinks = [adapter for adapter in source_objs if isinstance(adapter, DFAdapter)]

        if isinstance(output_desc, six.integer_types):
            output_types = [PortType.DATA for _ in range(output_desc)]
        else:
            output_types = [PortType.DATA if ch == 'd' else PortType.MODEL for ch in output_desc]

        merge_node = MockNode(msg, action, input_types, output_types)
        odps = None
        for idx, o in enumerate(source_objs):
            # Input ports are named 'input1', 'input2', ... (1-based).
            o._link_node(merge_node, 'input%d' % (1 + idx))
            # NOTE(review): 'odps' keeps the value from the LAST source —
            # presumably all sources share one ODPS instance; confirm.
            odps = o._odps
        outputs = []
        for idx, out_type in enumerate(output_types):
            if out_type == PortType.DATA or PmmlModel is None:
                # Derive the output schema from the first DataFrame source.
                schema = df_schema_to_odps_schema(six.next(s for s in sources if isinstance(s, CollectionExpr)).schema)
                new_df = DataFrame(DFAdapter._build_mock_table('mock_table', schema, self.odps))
                DFAdapter(odps, merge_node.outputs['output%d' % (1 + idx)], new_df, uplink=uplinks)
                outputs.append(new_df)
            else:
                outputs.append(PmmlModel(odps, port=merge_node.outputs['output%d' % (1 + idx)]))
        if len(output_types) == 1:
            return outputs[0]
        else:
            return outputs
 def _random_values(self):
     """Produce one Record holding a random string, int64 and float64 value."""
     row = [self._gen_random_str(),
            self._gen_random_int64(),
            self._gen_random_float64()]
     converted_schema = df_schema_to_odps_schema(self.schema)
     return Record(schema=converted_schema, values=row)