Python StringField Examples

Programming Language: Python

Namespace/Package Name: mars.serialize

Class/Type: StringField

Examples at hotexamples.com: 17

Python StringField - 17 examples found. These are the top rated real world Python examples of mars.serialize.StringField extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

StringField(17)

Frequently Used Methods

StringField (17)

Example #1

Show file

File: datastore.py Project: wjsi/aliyun-odps-python-sdk

class DataFrameWriteTable(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123460

    _dtypes = SeriesField('dtypes')

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _overwrite = BoolField('overwrite')
    _write_batch_size = Int64Field('write_batch_size')
    _unknown_as_string = BoolField('unknown_as_string')

    def __init__(self,
                 dtypes=None,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 unknown_as_string=None,
                 over_write=None,
                 write_batch_size=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameWriteTable,
              self).__init__(_dtypes=dtypes,
                             _odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _unknown_as_string=unknown_as_string,
                             _overwrite=over_write,
                             _write_batch_size=write_batch_size,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def unknown_as_string(self):
        return self._unknown_as_string

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def overwrite(self):
        return self._overwrite

    @property
    def write_batch_size(self):
        return self._write_batch_size

    def __call__(self, x):
        shape = (0, ) * len(x.shape)
        index_value = parse_index(x.index_value.to_pandas()[:0], x.key,
                                  'index')
        columns_value = parse_index(x.columns_value.to_pandas()[:0],
                                    x.key,
                                    'columns',
                                    store_data=True)
        return self.new_dataframe([x],
                                  shape=shape,
                                  dtypes=x.dtypes[:0],
                                  index_value=index_value,
                                  columns_value=columns_value)

    @classmethod
    def _tile_cupid(cls, op):
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )
        cupid_ctx = context()

        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=odps_params['project'],
                 endpoint=endpoint)
        cupid_session = CupidSession(o)

        data_src = o.get_table(op.table_name)

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        input_df = build_concatenated_rows_frame(op.inputs[0])
        out_df = op.outputs[0]

        out_chunks = []
        out_chunk_shape = (0, ) * len(input_df.shape)
        blocks = {}
        for chunk in input_df.chunks:
            block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace(
                '-', '')
            chunk_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes,
                table_name=op.table_name,
                unknown_as_string=op.unknown_as_string,
                partition_spec=op.partition_spec,
                cupid_handle=to_str(upload_session.handle),
                block_id=block_id,
                write_batch_size=op.write_batch_size)
            out_chunk = chunk_op.new_chunk([chunk],
                                           shape=out_chunk_shape,
                                           index=chunk.index,
                                           index_value=out_df.index_value,
                                           dtypes=chunk.dtypes)
            out_chunks.append(out_chunk)
            blocks[block_id] = op.partition_spec

        # build commit tree
        combine_size = 8
        chunks = out_chunks
        while len(chunks) >= combine_size:
            new_chunks = []
            for i in range(0, len(chunks), combine_size):
                chks = chunks[i:i + combine_size]
                if len(chks) == 1:
                    chk = chks[0]
                else:
                    chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                       is_terminal=False)
                    chk = chk_op.new_chunk(chks,
                                           shape=out_chunk_shape,
                                           index_value=out_df.index_value,
                                           dtypes=op.dtypes)
                new_chunks.append(chk)
            chunks = new_chunks

        assert len(chunks) < combine_size

        commit_table_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                    table_name=op.table_name,
                                                    blocks=blocks,
                                                    cupid_handle=to_str(
                                                        upload_session.handle),
                                                    overwrite=op.overwrite,
                                                    odps_params=op.odps_params,
                                                    is_terminal=True)
        commit_table_chunk = commit_table_op.new_chunk(
            chunks,
            shape=out_chunk_shape,
            dtypes=op.dtypes,
            index_value=out_df.index_value)

        new_op = op.copy()
        return new_op.new_dataframes(op.inputs,
                                     shape=out_df.shape,
                                     index_value=out_df.index_value,
                                     dtypes=out_df.dtypes,
                                     columns_value=out_df.columns_value,
                                     chunks=[commit_table_chunk],
                                     nsplits=((0, ), ) * len(out_chunk_shape))

    @classmethod
    def _tile_tunnel(cls, op):
        out_df = op.outputs[0]
        in_df = build_concatenated_rows_frame(op.inputs[0])

        out_chunks = []
        for chunk in in_df.chunks:
            chunk_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes,
                table_name=op.table_name,
                odps_params=op.odps_params,
                partition_spec=op.partition_spec)
            index_value = parse_index(chunk.index_value.to_pandas()[:0], chunk)
            out_chunk = chunk_op.new_chunk([chunk],
                                           shape=(0, 0),
                                           index_value=index_value,
                                           columns_value=out_df.columns_value,
                                           dtypes=out_df.dtypes,
                                           index=chunk.index)
            out_chunks.append(out_chunk)

        new_op = op.copy()
        params = out_df.params.copy()
        params.update(
            dict(chunks=out_chunks,
                 nsplits=((0, ) * in_df.chunk_shape[0], (0, ))))
        return new_op.new_tileables([in_df], **params)

    @classmethod
    def tile(cls, op):
        from cupid.runtime import RuntimeContext

        if RuntimeContext.is_context_ready():
            return cls._tile_cupid(op)
        else:
            return cls._tile_tunnel(op)

Example #2

Show file

class DataFrameWriteTableSplit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123461

    _dtypes = SeriesField('dtypes')

    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _cupid_handle = StringField('cupid_handle')
    _block_id = StringField('block_id')
    _write_batch_size = Int64Field('write_batch_size')

    def __init__(self,
                 dtypes=None,
                 table_name=None,
                 partition_spec=None,
                 cupid_handle=None,
                 block_id=None,
                 write_batch_size=None,
                 **kw):
        super(DataFrameWriteTableSplit,
              self).__init__(_dtypes=dtypes,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _cupid_handle=cupid_handle,
                             _block_id=block_id,
                             _write_batch_size=write_batch_size,
                             _object_type=ObjectType.dataframe,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def block_id(self):
        return self._block_id

    @property
    def write_batch_size(self):
        return self._write_batch_size

    @classmethod
    def execute(cls, ctx, op):
        import pyarrow as pa
        import pandas as pd
        from ...df.backends.pd.types import pd_to_df_schema
        from cupid.io.table.core import BlockWriter

        to_store_data = ctx[op.inputs[0].key]

        odps_schema = pd_to_df_schema(to_store_data, unknown_as_string=True)
        project_name, table_name = op.table_name.split('.')
        block_writer = BlockWriter(_table_name=table_name,
                                   _project_name=project_name,
                                   _table_schema=odps_schema,
                                   _partition_spec=op.partition_spec,
                                   _block_id=op.block_id,
                                   _handle=op.cupid_handle)
        logger.debug('Start writing table block, block id: %s', op.block_id)
        with block_writer.open_arrow_writer() as cupid_writer:

            sink = pa.BufferOutputStream()

            batch_size = op.write_batch_size or 1024
            schema = pa.RecordBatch.from_pandas(to_store_data[:1],
                                                preserve_index=False).schema
            arrow_writer = pa.RecordBatchStreamWriter(sink, schema)
            batch_idx = 0
            batch_data = to_store_data[batch_size * batch_idx:batch_size *
                                       (batch_idx + 1)]
            while len(batch_data) > 0:
                batch = pa.RecordBatch.from_pandas(batch_data,
                                                   preserve_index=False)
                arrow_writer.write_batch(batch)
                batch_idx += 1
                batch_data = to_store_data[batch_size * batch_idx:batch_size *
                                           (batch_idx + 1)]
            arrow_writer.close()
            cupid_writer.write(sink.getvalue())
        logger.debug('Write table block finished, block id: %s', op.block_id)

        block_writer.commit()
        ctx[op.outputs[0].key] = pd.DataFrame()

Example #3

Show file

class TensorTableCOO(TensorNoInput):
    _op_type_ = OperandDef.TABLE_COO

    _paths = ListField('paths', ValueType.string)
    _dim_cols = ListField('dim_cols', ValueType.string)
    _value_col = StringField('value_col')
    _storage_options = StringField('storage_options')

    def __init__(self, dtype=None, paths=None, dim_cols=None, value_col=None,
                 storage_options=None, sparse=True, **kw):
        super(TensorTableCOO, self).__init__(_paths=paths, _dim_cols=dim_cols, _value_col=value_col,
                                             _dtype=dtype, _storage_options=storage_options,
                                             _sparse=sparse, **kw)

    @property
    def paths(self):
        return self._paths

    @property
    def dim_cols(self):
        return self._dim_cols

    @property
    def value_col(self):
        return self._value_col

    @property
    def storage_options(self):
        return self._storage_options

    @classmethod
    def tile(cls, op):
        tensor = op.outputs[0]

        storage_opts = json.loads(op.storage_options)

        logger.debug('Start scanning data files in %s', op.paths[0])
        chunk_files = dict()
        for key in glob(op.paths[0], **storage_opts):
            file_name, _ = key.rsplit('.', 1)
            _, fn_suffix = file_name.rsplit('/', 1)
            dim_suffix = fn_suffix.rsplit('@', 1)[-1]
            dim_indices = tuple(int(pt) for pt in dim_suffix.split(','))
            if dim_indices not in chunk_files:
                chunk_files[dim_indices] = []
            chunk_files[dim_indices].append(key)
        logger.debug('Finish scanning data files in %s', op.paths[0])

        try:
            target_chunk_size = tensor.params.raw_chunk_size
        except AttributeError:
            target_chunk_size = tensor.extra_params.raw_chunk_size
        chunk_size = decide_chunk_sizes(tensor.shape, target_chunk_size, tensor.dtype.itemsize)
        chunk_size_idxes = (range(len(size)) for size in chunk_size)

        out_chunks = []
        for chunk_shape, chunk_idx in izip(itertools.product(*chunk_size),
                                           itertools.product(*chunk_size_idxes)):
            chunk_op = op.copy().reset_key()
            chunk_op._paths = chunk_files.get(chunk_idx, [])
            out_chunk = chunk_op.new_chunk(None, shape=chunk_shape, index=chunk_idx)
            out_chunks.append(out_chunk)

        new_op = op.copy()
        return new_op.new_tensors(op.inputs, tensor.shape,
                                  nsplits=chunk_size, chunks=out_chunks)

    @classmethod
    def execute(cls, ctx, op):
        import pyarrow.parquet as pq
        import pandas as pd
        import scipy.sparse as sps
        from mars.lib.sparse import SparseNDArray
        from ..io import open as fs_open

        dfs = []
        storage_opts = json.loads(op.storage_options)
        for p in op.paths:
            with fs_open(p, 'rb', **storage_opts) as inp_file:
                f = inp_file.read()
                dfs.append(pq.read_table(BytesIO(f)).to_pandas())

        chunk = op.outputs[0]
        if op.sparse and len(dfs) == 0:
            if len(chunk.shape) == 1:
                csr_array = sps.csr_matrix((chunk.shape[0], 1))
                ctx[chunk.key] = SparseNDArray(csr_array, shape=chunk.shape)
            else:
                csr_array = sps.csr_matrix(chunk.shape)
                ctx[chunk.key] = SparseNDArray(csr_array)
            return

        df_merged = pd.concat(dfs, ignore_index=True)
        dim_arrays = [df_merged[col] for col in op.dim_cols]
        value_array = df_merged[op.value_col].astype(chunk.dtype)
        del df_merged

        if op.sparse:
            if len(chunk.shape) == 1:
                dim_arrays.append(np.zeros((len(dim_arrays[0]))))
                csr_array = sps.csr_matrix((value_array, tuple(dim_arrays)), shape=(chunk.shape[0], 1))
            else:
                csr_array = sps.csr_matrix((value_array, tuple(dim_arrays)), shape=chunk.shape)
            del dim_arrays, value_array
            ctx[chunk.key] = SparseNDArray(csr_array, shape=chunk.shape)
        else:
            arr = np.empty(chunk.shape, dtype=value_array.dtype)
            arr[tuple(dim_arrays)] = value_array
            ctx[chunk.key] = arr

Example #4

Show file

File: datastore.py Project: wjsi/aliyun-odps-python-sdk

class DataFrameWriteTableSplit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123461

    _dtypes = SeriesField('dtypes')

    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _cupid_handle = StringField('cupid_handle')
    _block_id = StringField('block_id')
    _write_batch_size = Int64Field('write_batch_size')
    _unknown_as_string = BoolField('unknown_as_string')

    # for tunnel
    _odps_params = DictField('odps_params')

    def __init__(self,
                 dtypes=None,
                 table_name=None,
                 odps_params=None,
                 partition_spec=None,
                 cupid_handle=None,
                 unknown_as_string=None,
                 block_id=None,
                 write_batch_size=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameWriteTableSplit,
              self).__init__(_dtypes=dtypes,
                             _table_name=table_name,
                             _odps_params=odps_params,
                             _partition_spec=partition_spec,
                             _unknown_as_string=unknown_as_string,
                             _cupid_handle=cupid_handle,
                             _block_id=block_id,
                             _write_batch_size=write_batch_size,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def table_name(self):
        return self._table_name

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def unknown_as_string(self):
        return self._unknown_as_string

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def block_id(self):
        return self._block_id

    @property
    def write_batch_size(self):
        return self._write_batch_size

    @classmethod
    def _execute_in_cupid(cls, ctx, op):
        import pyarrow as pa
        import pandas as pd
        from ....df.backends.pd.types import pd_to_df_schema
        from cupid.io.table.core import BlockWriter

        to_store_data = ctx[op.inputs[0].key]

        odps_schema = pd_to_df_schema(to_store_data,
                                      unknown_as_string=op.unknown_as_string)
        project_name, table_name = op.table_name.split('.')
        block_writer = BlockWriter(_table_name=table_name,
                                   _project_name=project_name,
                                   _table_schema=odps_schema,
                                   _partition_spec=op.partition_spec,
                                   _block_id=op.block_id,
                                   _handle=op.cupid_handle)
        logger.debug('Start writing table block, block id: %s', op.block_id)
        with block_writer.open_arrow_writer() as cupid_writer:

            sink = pa.BufferOutputStream()

            batch_size = op.write_batch_size or 1024
            batch_idx = 0
            batch_data = to_store_data[batch_size * batch_idx:batch_size *
                                       (batch_idx + 1)]
            batch_data = convert_pandas_object_to_string(batch_data)
            schema = pa.RecordBatch.from_pandas(to_store_data[:1],
                                                preserve_index=False).schema
            arrow_writer = pa.RecordBatchStreamWriter(sink, schema)
            while len(batch_data) > 0:
                batch = pa.RecordBatch.from_pandas(batch_data,
                                                   preserve_index=False)
                arrow_writer.write_batch(batch)
                batch_idx += 1
                batch_data = to_store_data[batch_size * batch_idx:batch_size *
                                           (batch_idx + 1)]
            arrow_writer.close()
            cupid_writer.write(sink.getvalue())
        logger.debug('Write table block finished, block id: %s', op.block_id)

        block_writer.commit()
        ctx[op.outputs[0].key] = pd.DataFrame()

    @classmethod
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel
        import pyarrow as pa
        import pandas as pd

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)

        if op.partition_spec is not None:
            upload_session = tunnel.create_upload_session(
                t.name, partition_spec=op.partition_spec)
        else:
            upload_session = tunnel.create_upload_session(t.name)

        logger.debug('Start writing table %s split index: %s', op.table_name,
                     op.inputs[0].index)
        writer = upload_session.open_arrow_writer(0)
        arrow_rb = pa.RecordBatch.from_pandas(ctx[op.inputs[0].key])
        writer.write(arrow_rb)
        writer.close()
        upload_session.commit([0])
        logger.debug('Finish writing table %s split index: %s', op.table_name,
                     op.inputs[0].index)
        ctx[op.outputs[0].key] = pd.DataFrame()

    @classmethod
    def execute(cls, ctx, op):
        if op.cupid_handle is not None:
            cls._execute_in_cupid(ctx, op)
        else:
            cls._execute_arrow_tunnel(ctx, op)

Example #5

Show file

    class RunScript(LearnMergeDictOperand):
        _op_type_ = 743210

        _code = BytesField('code')
        _mode = StringField('mode')
        _command_args = ListField('command_args')
        _world_size = Int32Field('world_size')
        _rank = Int32Field('rank')

        def __init__(self,
                     code=None,
                     mode=None,
                     command_args=None,
                     world_size=None,
                     rank=None,
                     merge=None,
                     output_types=None,
                     **kw):
            super().__init__(_code=code,
                             _mode=mode,
                             _command_args=command_args,
                             _world_size=world_size,
                             _rank=rank,
                             _merge=merge,
                             _output_types=output_types,
                             **kw)
            if self._output_types is None:
                self._output_types = [OutputType.object]

        @property
        def code(self):
            return self._code

        @property
        def mode(self):
            return self._mode

        @property
        def world_size(self):
            return self._world_size

        @property
        def rank(self):
            return self._rank

        @property
        def command_args(self):
            return self._command_args or []

        def __call__(self):
            return self.new_tileable(None)

        @classmethod
        def tile(cls, op):
            out_chunks = []
            for i in range(op.world_size):
                chunk_op = op.copy().reset_key()
                chunk_op._rank = i
                out_chunks.append(chunk_op.new_chunk(None, index=(i, )))

            new_op = op.copy()
            return new_op.new_tileables(
                op.inputs,
                chunks=out_chunks,
                nsplits=(tuple(np.nan for _ in range(len(out_chunks))), ))

        @classmethod
        def _execute_with_subprocess(cls, op, env=None):
            # write source code into a temp file
            fd, filename = tempfile.mkstemp('.py')
            with os.fdopen(fd, 'wb') as f:
                f.write(op.code)
            logger.debug('Write code to temp file.')

            env = env or dict()
            envs = os.environ.copy().update(env)
            try:
                # exec code in a new process
                process = subprocess.Popen([sys.executable, filename] +
                                           op.command_args,
                                           env=envs)
                process.wait()
                if process.returncode != 0:
                    raise RuntimeError('Run script failed')

            finally:
                os.remove(filename)

        @classmethod
        def _execute_with_exec(cls, op, local=None):
            local = local or dict()

            try:
                exec(op.code, local)
            finally:
                sys.stdout.flush()

        @classmethod
        def _set_envs(cls, ctx, op):
            scheduler_address = ctx._scheduler_address
            session_id = ctx._session_id

            # set mars envs
            env = os.environ
            env['MARS_SCHEDULER_ADDRESS'] = str(scheduler_address)
            env['MARS_SESSION_ID'] = str(session_id)
            env['RANK'] = str(op.rank)

        @classmethod
        def _build_locals(cls, ctx, op):
            logger.debug('Start to create mars session.')
            sess = ctx.get_current_session().as_default()

            return dict(session=sess)

        @classmethod
        def execute(cls, ctx, op):
            if op.merge:
                return super().execute(ctx, op)

            old_env = os.environ.copy()
            cls._set_envs(ctx, op)

            try:
                if op.mode == 'spawn':
                    cls._execute_with_subprocess(op)
                elif op.mode == 'exec':
                    cls._execute_with_exec(op,
                                           local=cls._build_locals(ctx, op))
                else:
                    raise TypeError('Unsupported mode {}'.format(op.mode))

                if op.rank == 0:
                    ctx[op.outputs[0].key] = {'status': 'ok'}
                else:
                    ctx[op.outputs[0].key] = {}
            finally:
                os.environ = old_env

Example #6

Show file

File: where.py Project: yyaaa1/mars

class DataFrameWhere(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = opcodes.WHERE

    _input = AnyField('input')
    _cond = AnyField('cond')
    _other = AnyField('other')
    _axis = Int32Field('axis')
    _level = AnyField('level')
    _errors = StringField('errors')
    _try_cast = BoolField('try_cast')
    _replace_true = BoolField('replace_true')

    def __init__(self, input=None, cond=None, other=None,  # pylint: disable=redefined-builtin
                 axis=None, level=None, errors=None, try_cast=None, replace_true=None, **kw):
        super().__init__(_input=input, _cond=cond, _other=other, _axis=axis, _level=level,
                         _errors=errors, _try_cast=try_cast, _replace_true=replace_true, **kw)

    @property
    def input(self):
        return self._input

    @property
    def cond(self):
        return self._cond

    @property
    def other(self):
        return self._other

    @property
    def axis(self):
        return self._axis

    @property
    def level(self):
        return self._level

    @property
    def errors(self):
        return self._errors

    @property
    def try_cast(self):
        return self._try_cast

    @property
    def replace_true(self):
        return self._replace_true

    def __call__(self, df_or_series):
        def _check_input_index(obj, axis=None):
            axis = axis if axis is not None else self.axis
            if isinstance(obj, DATAFRAME_TYPE) \
                    and (
                        df_or_series.columns_value.key != obj.columns_value.key
                        or df_or_series.index_value.key != obj.index_value.key
                    ):
                raise NotImplementedError('Aligning different indices not supported')
            elif isinstance(obj, SERIES_TYPE) \
                    and df_or_series.axes[axis].index_value.key != obj.index_value.key:
                raise NotImplementedError('Aligning different indices not supported')

        _check_input_index(self.cond, axis=0)
        _check_input_index(self.other)

        if isinstance(df_or_series, DATAFRAME_TYPE):
            mock_obj = build_df(df_or_series)
        else:
            mock_obj = build_series(df_or_series)

        if isinstance(self.other, (pd.DataFrame, DATAFRAME_TYPE)):
            mock_other = build_df(self.other)
        elif isinstance(self.other, (pd.Series, SERIES_TYPE)):
            mock_other = build_series(self.other)
        else:
            mock_other = self.other

        result_df = mock_obj.where(np.zeros(mock_obj.shape).astype(bool), other=mock_other,
                                   axis=self.axis, level=self.level, errors=self.errors,
                                   try_cast=self.try_cast)

        inputs = filter_inputs([df_or_series, self.cond, self.other])
        if isinstance(df_or_series, DATAFRAME_TYPE):
            return self.new_dataframe(inputs, shape=df_or_series.shape,
                                      dtypes=result_df.dtypes, index_value=df_or_series.index_value,
                                      columns_value=df_or_series.columns_value)
        else:
            return self.new_series(inputs, shape=df_or_series.shape, name=df_or_series.name,
                                   dtype=result_df.dtype, index_value=df_or_series.index_value)

    def _set_inputs(self, inputs):
        super()._set_inputs(inputs)
        inputs_iter = iter(self._inputs)
        self._input = next(inputs_iter)
        if isinstance(self._cond, (Base, Entity)):
            self._cond = next(inputs_iter)
        if isinstance(self._other, (Base, Entity)):
            self._other = next(inputs_iter)

    @classmethod
    def tile(cls, op: "DataFrameWhere"):
        def rechunk_input(inp, axis=None):
            axis = axis if axis is not None else op.axis
            if isinstance(inp, DATAFRAME_TYPE):
                inp = inp.rechunk(op.input.nsplits)._inplace_tile()
            elif isinstance(inp, SERIES_TYPE):
                inp = inp.rechunk({0: op.input.nsplits[axis]})._inplace_tile()
            return inp

        def get_tiled_chunk(obj, index, axis=None):
            if isinstance(obj, DATAFRAME_TYPE):
                return obj.cix[index[0], index[1]]
            elif isinstance(obj, SERIES_TYPE):
                axis = axis if axis is not None else op.axis
                return obj.cix[index[axis], ]
            else:
                return obj

        # TODO support axis alignment for three objects
        cond = rechunk_input(op.cond, axis=0)
        other = rechunk_input(op.other)

        chunks = []
        for c in op.input.chunks:
            cond_chunk = get_tiled_chunk(cond, c.index, axis=0)
            other_chunk = get_tiled_chunk(other, c.index)

            new_op = op.copy().reset_key()
            new_op._cond = cond_chunk
            new_op._other = other_chunk

            inputs = filter_inputs([c, cond_chunk, other_chunk])
            chunks.append(new_op.new_chunk(inputs, **c.params))

        new_op = op.copy().reset_key()
        return new_op.new_tileables(op.inputs, chunks=chunks, nsplits=op.input.nsplits,
                                    **op.input.params)

    @classmethod
    def execute(cls, ctx, op: "DataFrameWhere"):
        out_obj = op.outputs[0]

        input_data = ctx[op.input.key]
        cond = op.cond
        if isinstance(cond, (Base, Entity)):
            cond = ctx[cond.key]

        other = op.other
        if isinstance(other, (Base, Entity)):
            other = ctx[other.key]

        if op.replace_true:
            ctx[out_obj.key] = input_data.mask(cond, other, axis=op.axis, level=op.level,
                                               errors=op.errors, try_cast=op.try_cast)
        else:
            ctx[out_obj.key] = input_data.where(cond, other, axis=op.axis, level=op.level,
                                                errors=op.errors, try_cast=op.try_cast)

Example #7

Show file

class DataFrameReadTable(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123450

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _dtypes = SeriesField('dtypes')
    _add_offset = BoolField('add_offset')
    _columns = ListField('columns')

    def __init__(self,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 columns=None,
                 dtypes=None,
                 sparse=None,
                 add_offset=True,
                 **kw):
        super(DataFrameReadTable,
              self).__init__(_odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _columns=columns,
                             _dtypes=dtypes,
                             _sparse=sparse,
                             _add_offset=add_offset,
                             _object_type=ObjectType.dataframe,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition(self):
        return getattr(self, '_partition_spec', None)

    @property
    def columns(self):
        return self._columns

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def add_offset(self):
        return self._add_offset

    def __call__(self, shape, chunk_bytes=None):
        import numpy as np
        import pandas as pd

        if np.isnan(shape[0]):
            index_value = parse_index(pd.RangeIndex(0))
        else:
            index_value = parse_index(pd.RangeIndex(shape[0]))
        columns_value = parse_index(self.dtypes.index, store_data=True)
        return self.new_dataframe(None,
                                  shape,
                                  dtypes=self.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  chunk_bytes=chunk_bytes)

    @classmethod
    def tile(cls, op):
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context

        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        o = ODPS(None, None, account=account, **odps_params)
        cupid_session = CupidSession(o)

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or CHUNK_LIMIT

        data_src = o.get_table(op.table_name)
        if op.partition is not None:
            data_src = data_src.get_partition(op.partition)

        logger.debug('Start creating download session from cupid.')
        while True:
            try:
                download_session = cupid_session.create_download_session(
                    data_src, split_size=split_size, columns=op.columns)
                break
            except CupidError:
                logger.debug(
                    'The number of splits exceeds 100000, split_size is {}'.
                    format(split_size))
                if split_size >= MAX_CHUNK_SIZE:
                    raise
                else:
                    split_size *= 2

        logger.debug('%s table splits have been created.',
                     str(len(download_session.splits)))

        out_chunks = []
        # Ignore add_offset at this time.
        op._add_offset = False

        for idx, split in enumerate(download_session.splits):
            chunk_op = DataFrameReadTableSplit(
                cupid_handle=to_str(split.handle),
                split_index=split.split_index,
                split_file_start=split.split_file_start,
                split_file_end=split.split_file_end,
                schema_file_start=split.schema_file_start,
                schema_file_end=split.schema_file_end,
                add_offset=op.add_offset,
                dtypes=op.dtypes,
                sparse=op.sparse)
            # the chunk shape is unknown
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(df.dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None,
                                           shape=(np.nan, df.shape[1]),
                                           dtypes=op.dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(idx, 0))
            out_chunks.append(out_chunk)

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (df.shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=df.shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=df.columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)

Example #8

Show file

File: datasource.py Project: wjsi/aliyun-odps-python-sdk

class DataFrameReadTable(_Base):
    _op_type_ = 123450

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _dtypes = SeriesField('dtypes')
    _add_offset = BoolField('add_offset')
    _columns = ListField('columns')
    _nrows = Int64Field('nrows')
    _use_arrow_dtype = BoolField('use_arrow_dtype')
    _string_as_binary = BoolField('string_as_binary')
    _append_partitions = BoolField('append_partitions')
    _last_modified_time = Int64Field('last_modified_time')
    _with_split_meta_on_tile = BoolField('with_split_meta_on_tile')

    def __init__(self,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 columns=None,
                 dtypes=None,
                 nrows=None,
                 sparse=None,
                 add_offset=True,
                 use_arrow_dtype=None,
                 string_as_binary=None,
                 memory_scale=None,
                 append_partitions=None,
                 last_modified_time=None,
                 with_split_meta_on_tile=False,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameReadTable,
              self).__init__(_odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _columns=columns,
                             _dtypes=dtypes,
                             _nrows=nrows,
                             _sparse=sparse,
                             _use_arrow_dtype=use_arrow_dtype,
                             _string_as_binary=string_as_binary,
                             _add_offset=add_offset,
                             _append_partitions=append_partitions,
                             _last_modified_time=last_modified_time,
                             _memory_scale=memory_scale,
                             _with_split_meta_on_tile=with_split_meta_on_tile,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition(self):
        return getattr(self, '_partition_spec', None)

    @property
    def columns(self):
        return self._columns

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def nrows(self):
        return self._nrows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @property
    def string_as_binary(self):
        return self._string_as_binary

    @property
    def add_offset(self):
        return self._add_offset

    @property
    def append_partitions(self):
        return self._append_partitions

    @property
    def with_split_meta_on_tile(self):
        return self._with_split_meta_on_tile

    def get_columns(self):
        return self._columns

    def set_pruned_columns(self, columns):
        self._columns = columns

    def __call__(self, shape, chunk_bytes=None, chunk_size=None):
        import numpy as np
        import pandas as pd

        if np.isnan(shape[0]):
            index_value = parse_index(pd.RangeIndex(0))
        else:
            index_value = parse_index(pd.RangeIndex(shape[0]))
        columns_value = parse_index(self.dtypes.index, store_data=True)
        return self.new_dataframe(None,
                                  shape,
                                  dtypes=self.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  chunk_bytes=chunk_bytes,
                                  chunk_size=chunk_size)

    @classmethod
    def _tile_cupid(cls, op):
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.errors import CupidError
        from mars.context import get_context

        cupid_ctx = context()

        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=odps_params['project'],
                 endpoint=endpoint)
        cupid_session = CupidSession(o)

        mars_context = get_context()

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index, store_data=True)

        table_obj = o.get_table(op.table_name)
        if not table_obj.schema.partitions:
            data_srcs = [table_obj]
        elif op.partition is not None and check_partition_exist(
                table_obj, op.partition):
            data_srcs = [table_obj.get_partition(op.partition)]
        else:
            data_srcs = list(table_obj.partitions)
            if op.partition is not None:
                data_srcs = filter_partitions(o, data_srcs, op.partition)

        out_chunks = []
        chunk_idx = 0

        for data_src in data_srcs:
            try:
                data_store_size = data_src.size
            except ODPSError:
                # fail to get data size, just ignore
                pass
            else:
                if data_store_size < split_size and mars_context is not None:
                    # get worker counts
                    worker_count = max(
                        len(mars_context.get_worker_addresses()), 1)
                    # data is too small, split as many as number of cores
                    split_size = data_store_size // worker_count
                    # at least 1M
                    split_size = max(split_size, 1 * 1024**2)
                    logger.debug(
                        'Input data size is too small, split_size is %s',
                        split_size)

            logger.debug(
                'Start creating download session of table %s from cupid, '
                'columns: %s', op.table_name, op.columns)
            while True:
                try:
                    download_session = cupid_session.create_download_session(
                        data_src,
                        split_size=split_size,
                        columns=op.columns,
                        with_split_meta=op.with_split_meta_on_tile)
                    break
                except CupidError:
                    logger.debug(
                        'The number of splits exceeds 100000, split_size is %s',
                        split_size)
                    if split_size >= MAX_CHUNK_SIZE:
                        raise
                    else:
                        split_size *= 2

            logger.debug('%s table splits have been created.',
                         str(len(download_session.splits)))

            meta_chunk_rows = [
                split.meta_row_count for split in download_session.splits
            ]
            if np.isnan(out_shape[0]):
                est_chunk_rows = meta_chunk_rows
            else:
                sp_file_sizes = np.array([
                    sp.split_file_end - sp.split_file_start
                    for sp in download_session.splits
                ])
                total_size = sp_file_sizes.sum()
                ratio_chunk_rows = (sp_file_sizes * out_shape[0] //
                                    total_size).tolist()
                est_chunk_rows = [
                    mr if mr is not None else rr
                    for mr, rr in zip(meta_chunk_rows, ratio_chunk_rows)
                ]

            partition_spec = str(data_src.partition_spec) \
                if getattr(data_src, 'partition_spec', None) else None

            logger.warning('Estimated chunk rows: %r', est_chunk_rows)

            if len(download_session.splits) == 0:
                logger.debug('Table %s has no data', op.table_name)
                chunk_op = DataFrameReadTableSplit()
                index_value = parse_index(pd.RangeIndex(0))
                columns_value = parse_index(out_dtypes.index, store_data=True)
                out_chunk = chunk_op.new_chunk(None,
                                               shape=(np.nan, out_shape[1]),
                                               dtypes=op.dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(chunk_idx, 0))
                out_chunks.append(out_chunk)
                chunk_idx += 1
            else:
                for idx, split in enumerate(download_session.splits):
                    chunk_op = DataFrameReadTableSplit(
                        cupid_handle=to_str(split.handle),
                        split_index=split.split_index,
                        split_file_start=split.split_file_start,
                        split_file_end=split.split_file_end,
                        schema_file_start=split.schema_file_start,
                        schema_file_end=split.schema_file_end,
                        add_offset=op.add_offset,
                        dtypes=out_dtypes,
                        sparse=op.sparse,
                        split_size=split_size,
                        string_as_binary=op.string_as_binary,
                        use_arrow_dtype=op.use_arrow_dtype,
                        estimate_rows=est_chunk_rows[idx],
                        partition_spec=partition_spec,
                        append_partitions=op.append_partitions,
                        meta_raw_size=split.meta_raw_size,
                        nrows=meta_chunk_rows[idx] or op.nrows,
                        memory_scale=op.memory_scale)
                    # the chunk shape is unknown
                    index_value = parse_index(pd.RangeIndex(0))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(None,
                                                   shape=(np.nan,
                                                          out_shape[1]),
                                                   dtypes=out_dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(chunk_idx, 0))
                    chunk_idx += 1
                    out_chunks.append(out_chunk)

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)

    @classmethod
    def _tile_tunnel(cls, op):
        from odps import ODPS

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        table_obj = o.get_table(op.table_name)
        if not table_obj.schema.partitions:
            data_srcs = [table_obj]
        elif op.partition is not None and check_partition_exist(
                table_obj, op.partition):
            data_srcs = [table_obj.get_partition(op.partition)]
        else:
            data_srcs = list(table_obj.partitions)
            if op.partition is not None:
                data_srcs = filter_partitions(o, data_srcs, op.partition)

        out_chunks = []
        row_nsplits = []
        index_start = 0
        df = op.outputs[0]

        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index, store_data=True)

        for data_src in data_srcs:
            data_store_size = data_src.size
            shape = out_shape
            chunk_size = df.extra_params.chunk_size

            partition_spec = str(data_src.partition_spec) \
                if getattr(data_src, 'partition_spec', None) else None

            if chunk_size is None:
                chunk_bytes = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT
                chunk_count = data_store_size // chunk_bytes + (
                    data_store_size % chunk_bytes != 0)
                chunk_size = ceildiv(shape[0], chunk_count)
                split_size = chunk_bytes
            else:
                chunk_count = ceildiv(shape[0], chunk_size)
                split_size = data_store_size // chunk_count

            for i in range(chunk_count):
                start_index = chunk_size * i
                end_index = min(chunk_size * (i + 1), shape[0])
                row_size = end_index - start_index
                chunk_op = DataFrameReadTableSplit(
                    table_name=op.table_name,
                    partition_spec=partition_spec,
                    start_index=start_index,
                    end_index=end_index,
                    nrows=op.nrows,
                    odps_params=op.odps_params,
                    columns=op.columns,
                    add_offset=op.add_offset,
                    dtypes=out_dtypes,
                    sparse=op.sparse,
                    split_size=split_size,
                    use_arrow_dtype=op.use_arrow_dtype,
                    estimate_rows=row_size,
                    append_partitions=op.append_partitions,
                    memory_scale=op.memory_scale)
                index_value = parse_index(pd.RangeIndex(
                    start_index, end_index))
                columns_value = parse_index(out_dtypes.index, store_data=True)
                out_chunk = chunk_op.new_chunk(None,
                                               shape=(row_size, out_shape[1]),
                                               dtypes=out_dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(index_start + i, 0))
                row_nsplits.append(row_size)
                out_chunks.append(out_chunk)

            index_start += chunk_count

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = (tuple(row_nsplits), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)

    @classmethod
    def _tile(cls, op):
        from cupid.runtime import RuntimeContext

        if RuntimeContext.is_context_ready():
            return cls._tile_cupid(op)
        else:
            return cls._tile_tunnel(op)

    if not head_can_be_opt:
        tile = _tile

Example #9

Show file

File: datasource.py Project: YangXinNewlife/aliyun-odps-python-sdk

class DataFrameReadTable(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123450

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _dtypes = SeriesField('dtypes')
    _add_offset = BoolField('add_offset')
    _columns = ListField('columns')
    _nrows = Int64Field('nrows')
    _use_arrow_dtype = BoolField('use_arrow_dtype')

    def __init__(self,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 columns=None,
                 dtypes=None,
                 nrows=None,
                 sparse=None,
                 add_offset=True,
                 use_arrow_dtype=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameReadTable,
              self).__init__(_odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _columns=columns,
                             _dtypes=dtypes,
                             _nrows=nrows,
                             _sparse=sparse,
                             _use_arrow_dtype=use_arrow_dtype,
                             _add_offset=add_offset,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition(self):
        return getattr(self, '_partition_spec', None)

    @property
    def columns(self):
        return self._columns

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def nrows(self):
        return self._nrows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @property
    def add_offset(self):
        return self._add_offset

    def __call__(self, shape, chunk_bytes=None):
        import numpy as np
        import pandas as pd

        if np.isnan(shape[0]):
            index_value = parse_index(pd.RangeIndex(0))
        else:
            index_value = parse_index(pd.RangeIndex(shape[0]))
        columns_value = parse_index(self.dtypes.index, store_data=True)
        return self.new_dataframe(None,
                                  shape,
                                  dtypes=self.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  chunk_bytes=chunk_bytes)

    @classmethod
    def tile(cls, op):
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from mars.context import get_context

        cupid_ctx = context()
        if cupid_ctx is None:
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )

        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        o = ODPS(None, None, account=account, **odps_params)
        cupid_session = CupidSession(o)

        mars_context = get_context()

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

        data_src = o.get_table(op.table_name)
        if op.partition is not None:
            data_src = data_src.get_partition(op.partition)

        try:
            data_store_size = data_src.size
        except ODPSError:
            # fail to get data size, just ignore
            pass
        else:
            if data_store_size < split_size and mars_context is not None:
                # get worker counts
                worker_count = max(len(mars_context.get_worker_addresses()), 1)
                # data is too small, split as many as number of cores
                split_size = data_store_size // worker_count
                # at least 1M
                split_size = max(split_size, 1 * 1024**2)
                logger.debug(
                    'Input data size is too small, split_size is {}'.format(
                        split_size))

        logger.debug(
            'Start creating download session of table {} from cupid.'.format(
                op.table_name))
        while True:
            try:
                download_session = cupid_session.create_download_session(
                    data_src, split_size=split_size, columns=op.columns)
                break
            except CupidError:
                logger.debug(
                    'The number of splits exceeds 100000, split_size is {}'.
                    format(split_size))
                if split_size >= MAX_CHUNK_SIZE:
                    raise
                else:
                    split_size *= 2

        logger.debug('%s table splits have been created.',
                     str(len(download_session.splits)))

        if np.isnan(df.shape[0]):
            est_chunk_rows = [None] * len(download_session.splits)
        else:
            sp_file_sizes = np.array([
                sp.split_file_end - sp.split_file_start
                for sp in download_session.splits
            ])
            total_size = sp_file_sizes.sum()
            est_chunk_rows = sp_file_sizes * df.shape[0] // total_size

        logger.warning('Estimated chunk rows: %r', est_chunk_rows)

        out_chunks = []
        # Ignore add_offset at this time.
        op._add_offset = False

        if len(download_session.splits) == 0:
            logger.debug('Table {} has no data'.format(op.table_name))
            chunk_op = DataFrameReadTableSplit()
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(df.dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None,
                                           shape=(np.nan, df.shape[1]),
                                           dtypes=op.dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(0, 0))
            out_chunks = [out_chunk]
        else:
            for idx, split in enumerate(download_session.splits):
                chunk_op = DataFrameReadTableSplit(
                    cupid_handle=to_str(split.handle),
                    split_index=split.split_index,
                    split_file_start=split.split_file_start,
                    split_file_end=split.split_file_end,
                    schema_file_start=split.schema_file_start,
                    schema_file_end=split.schema_file_end,
                    add_offset=op.add_offset,
                    dtypes=op.dtypes,
                    sparse=op.sparse,
                    split_size=split_size,
                    use_arrow_dtype=op.use_arrow_dtype,
                    estimate_rows=est_chunk_rows[idx])
                # the chunk shape is unknown
                index_value = parse_index(pd.RangeIndex(0))
                columns_value = parse_index(df.dtypes.index, store_data=True)
                out_chunk = chunk_op.new_chunk(None,
                                               shape=(np.nan, df.shape[1]),
                                               dtypes=op.dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(idx, 0))
                out_chunks.append(out_chunk)

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (df.shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=df.shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=df.columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)

Example #10

Show file

class DataFrameReadTableSplit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123451

    _cupid_handle = StringField('cupid_handle')
    _split_index = Int64Field('split_index')
    _split_file_start = Int64Field('split_file_start')
    _split_file_end = Int64Field('split_file_end')
    _schema_file_start = Int64Field('schema_file_start')
    _schema_file_end = Int64Field('schema_file_end')
    _dtypes = SeriesField('dtypes')
    _nrows = Int64Field('nrows')

    def __init__(self,
                 cupid_handle=None,
                 split_index=None,
                 split_file_start=None,
                 split_file_end=None,
                 schema_file_start=None,
                 schema_file_end=None,
                 nrows=None,
                 dtypes=None,
                 sparse=None,
                 **kw):
        super(DataFrameReadTableSplit,
              self).__init__(_cupid_handle=cupid_handle,
                             _split_index=split_index,
                             _split_file_start=split_file_start,
                             _split_file_end=split_file_end,
                             _schema_file_start=schema_file_start,
                             _schema_file_end=schema_file_end,
                             _nrows=nrows,
                             _dtypes=dtypes,
                             _sparse=sparse,
                             _object_type=ObjectType.dataframe,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def output_limit(self):
        return 1

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def split_index(self):
        return self._split_index

    @property
    def split_file_start(self):
        return self._split_file_start

    @property
    def split_file_end(self):
        return self._split_file_end

    @property
    def schema_file_start(self):
        return self._schema_file_start

    @property
    def schema_file_end(self):
        return self._schema_file_end

    @property
    def nrows(self):
        return self._nrows

    @property
    def dtypes(self):
        return self._dtypes

    @classmethod
    def execute(cls, ctx, op):
        import pyarrow as pa
        from cupid.io.table import TableSplit

        tsp = TableSplit(
            _handle=op.cupid_handle,
            _split_index=op.split_index,
            _split_file_start=op.split_file_start,
            _split_file_end=op.split_file_end,
            _schema_file_start=op.schema_file_start,
            _schema_file_end=op.schema_file_end,
        )
        logger.debug('Read split table, split index: %s', op.split_index)
        reader = tsp.open_arrow_reader()
        if op.nrows is not None:
            nrows = 0
            batches = []
            while nrows < op.nrows:
                try:
                    batch = reader.read_next_batch()
                    nrows += batch.num_rows
                    batches.append(batch)
                except StopIteration:
                    break
            logger.debug('Read %s rows of this split.', op.nrows)
            data = pa.Table.from_batches(batches).to_pandas()[:op.nrows]
        else:
            arrow_table = reader.read_all()
            data = arrow_table.to_pandas()
        logger.debug("Read data size is %s",
                     data.memory_usage(deep=True).sum())
        logger.debug('Read split table finished, split index: %s',
                     op.split_index)
        ctx[op.outputs[0].key] = data

Example #11

Show file

File: datasource.py Project: YangXinNewlife/aliyun-odps-python-sdk

class DataFrameReadTableSplit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123451

    _cupid_handle = StringField('cupid_handle')
    _split_index = Int64Field('split_index')
    _split_file_start = Int64Field('split_file_start')
    _split_file_end = Int64Field('split_file_end')
    _schema_file_start = Int64Field('schema_file_start')
    _schema_file_end = Int64Field('schema_file_end')
    _use_arrow_dtype = BoolField('use_arrow_dtype')
    _dtypes = SeriesField('dtypes')
    _nrows = Int64Field('nrows')

    _split_size = Int64Field('split_size')
    _estimate_rows = Int64Field('estimate_rows')

    def __init__(self,
                 cupid_handle=None,
                 split_index=None,
                 split_file_start=None,
                 split_file_end=None,
                 schema_file_start=None,
                 schema_file_end=None,
                 nrows=None,
                 dtypes=None,
                 split_size=None,
                 use_arrow_dtype=None,
                 estimate_rows=None,
                 sparse=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameReadTableSplit,
              self).__init__(_cupid_handle=cupid_handle,
                             _split_index=split_index,
                             _split_file_start=split_file_start,
                             _split_file_end=split_file_end,
                             _schema_file_start=schema_file_start,
                             _schema_file_end=schema_file_end,
                             _use_arrow_dtype=use_arrow_dtype,
                             _nrows=nrows,
                             _estimate_rows=estimate_rows,
                             _split_size=split_size,
                             _dtypes=dtypes,
                             _sparse=sparse,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def output_limit(self):
        return 1

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def split_index(self):
        return self._split_index

    @property
    def split_file_start(self):
        return self._split_file_start

    @property
    def split_file_end(self):
        return self._split_file_end

    @property
    def schema_file_start(self):
        return self._schema_file_start

    @property
    def schema_file_end(self):
        return self._schema_file_end

    @property
    def nrows(self):
        return self._nrows

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def split_size(self):
        return self._split_size

    @property
    def estimate_rows(self):
        return self._estimate_rows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @classmethod
    def estimate_size(cls, ctx, op):
        import numpy as np

        def is_object_dtype(dtype):
            try:
                return np.issubdtype(dtype, np.object_) \
                       or np.issubdtype(dtype, np.unicode_) \
                       or np.issubdtype(dtype, np.bytes_)
            except TypeError:  # pragma: no cover
                return False

        if op.split_size is None:
            ctx[op.outputs[0].key] = (0, 0)
            return

        arrow_size = ORC_COMPRESSION_RATIO * op.split_size
        n_strings = len([dt for dt in op.dtypes if is_object_dtype(dt)])
        if op.estimate_rows or op.nrows:
            rows = op.nrows if op.nrows is not None else op.estimate_rows
            pd_size = arrow_size + n_strings * rows * STRING_FIELD_OVERHEAD
            logger.debug('Estimate pandas memory cost: %r', pd_size)
        else:
            pd_size = arrow_size * 10 if n_strings else arrow_size

        ctx[op.outputs[0].key] = (pd_size, pd_size + arrow_size)

    @classmethod
    def execute(cls, ctx, op):
        import pyarrow as pa
        from cupid.io.table import TableSplit

        if op.cupid_handle is None:
            empty_df = pd.DataFrame()
            for name, dtype in op.outputs[0].dtypes.items():
                empty_df[name] = pd.Series(dtype=dtype)
            ctx[op.outputs[0].key] = empty_df
            return

        tsp = TableSplit(
            _handle=op.cupid_handle,
            _split_index=op.split_index,
            _split_file_start=op.split_file_start,
            _split_file_end=op.split_file_end,
            _schema_file_start=op.schema_file_start,
            _schema_file_end=op.schema_file_end,
        )
        logger.debug('Read split table, split index: %s', op.split_index)
        reader = tsp.open_arrow_reader()
        if op.nrows is not None:
            nrows = 0
            batches = []
            while nrows < op.nrows:
                try:
                    batch = reader.read_next_batch()
                    nrows += batch.num_rows
                    batches.append(batch)
                except StopIteration:
                    break
            logger.debug('Read %s rows of this split.', op.nrows)
            data = arrow_table_to_pandas_dataframe(
                pa.Table.from_batches(batches),
                use_arrow_dtype=op.use_arrow_dtype)[:op.nrows]
        else:
            arrow_table = reader.read_all()
            data = arrow_table_to_pandas_dataframe(
                arrow_table, use_arrow_dtype=op.use_arrow_dtype)
        data_columns = data.dtypes.index
        expected_columns = op.outputs[0].dtypes.index
        if not data_columns.equals(expected_columns):
            logger.debug("Data columns differs from output columns, "
                         "data columns: {}, output columns: {}".format(
                             data_columns, expected_columns))
            data.columns = expected_columns

        logger.debug('Read split table finished, split index: %s',
                     op.split_index)
        logger.debug('Split data shape is {}, size is {}'.format(
            data.shape,
            data.memory_usage(deep=True).sum()))
        ctx[op.outputs[0].key] = data

Example #12

Show file

class DataFrameReadTable(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123450

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _dtypes = SeriesField('dtypes')
    _add_offset = BoolField('add_offset')

    def __init__(self,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 dtypes=None,
                 sparse=None,
                 add_offset=True,
                 **kw):
        super(DataFrameReadTable,
              self).__init__(_odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _dtypes=dtypes,
                             _sparse=sparse,
                             _add_offset=add_offset,
                             _object_type=ObjectType.dataframe,
                             **kw)

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition(self):
        return getattr(self, '_partition_spec', None)

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def add_offset(self):
        return self._add_offset

    def __call__(self, shape, chunk_store_limit=None):
        import numpy as np
        import pandas as pd

        if np.isnan(shape[0]):
            index_value = parse_index(pd.RangeIndex(0))
        else:
            index_value = parse_index(pd.RangeIndex(shape[0]))
        columns_value = parse_index(self.dtypes.index, store_data=True)
        return self.new_dataframe(None,
                                  shape,
                                  dtypes=self.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  chunk_store_limit=chunk_store_limit)

    @classmethod
    def tile(cls, op):
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context

        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        o = ODPS(None, None, account=account, **op.odps_params)
        cupid_session = CupidSession(o)

        df = op.outputs[0]
        split_size = df.extra_params.chunk_store_limit or options.tensor.chunk_store_limit

        data_src = o.get_table(op.table_name)
        if op.partition is not None:
            data_src = data_src.get_partition(op.partition)

        logger.debug('Start creating download session from cupid.')
        download_session = cupid_session.create_download_session(
            data_src, split_size=split_size)
        logger.debug('%s table splits have been created.',
                     str(len(download_session.splits)))

        out_chunks = []
        out_count_chunks = []
        for idx, split in enumerate(download_session.splits):
            chunk_op = DataFrameReadTableSplit(
                cupid_handle=to_str(split.handle),
                split_index=split.split_index,
                split_file_start=split.split_file_start,
                split_file_end=split.split_file_end,
                schema_file_start=split.schema_file_start,
                schema_file_end=split.schema_file_end,
                dtypes=op.dtypes,
                sparse=op.sparse)
            # the chunk shape is unknown
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(df.dtypes.index, store_data=True)
            out_chunk, out_count_chunk = chunk_op.new_chunks(
                None,
                kws=[{
                    'shape': (np.nan, df.shape[1]),
                    'dtypes': op.dtypes,
                    'index_value': index_value,
                    'columns_value': columns_value,
                    'index': (idx, )
                }, {
                    'shape': (1, ),
                    'index': (idx, )
                }])
            out_chunks.append(out_chunk)
            out_count_chunks.append(out_count_chunk)

        if op.add_offset:
            output_chunks = []
            for i, chunk in enumerate(out_chunks):
                if i == 0:
                    output_chunks.append(chunk)
                    continue
                counts = out_count_chunks[:i]
                inputs = [chunk] + counts
                output_chunk = DataFrameReadTableWithOffset(
                    dtypes=chunk.dtypes).new_chunk(
                        inputs,
                        shape=chunk.shape,
                        index=chunk.index,
                        dtypes=chunk.dtypes,
                        index_value=chunk.index_value,
                        columns_value=chunk.columns_value)
                output_chunks.append(output_chunk)
        else:
            output_chunks = out_chunks

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(output_chunks), (df.shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=df.shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=df.columns_value,
                                     chunks=output_chunks,
                                     nsplits=nsplits)

Example #13

Show file

class DataFrameReadTableSplit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123451

    _cupid_handle = StringField('cupid_handle')
    _split_index = Int64Field('split_index')
    _split_file_start = Int64Field('split_file_start')
    _split_file_end = Int64Field('split_file_end')
    _schema_file_start = Int64Field('schema_file_start')
    _schema_file_end = Int64Field('schema_file_end')
    _dtypes = SeriesField('dtypes')

    def __init__(self,
                 cupid_handle=None,
                 split_index=None,
                 split_file_start=None,
                 split_file_end=None,
                 schema_file_start=None,
                 schema_file_end=None,
                 dtypes=None,
                 sparse=None,
                 **kw):
        super(DataFrameReadTableSplit,
              self).__init__(_cupid_handle=cupid_handle,
                             _split_index=split_index,
                             _split_file_start=split_file_start,
                             _split_file_end=split_file_end,
                             _schema_file_start=schema_file_start,
                             _schema_file_end=schema_file_end,
                             _dtypes=dtypes,
                             _sparse=sparse,
                             _object_type=ObjectType.dataframe,
                             **kw)

    @property
    def output_limit(self):
        return 2

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def split_index(self):
        return self._split_index

    @property
    def split_file_start(self):
        return self._split_file_start

    @property
    def split_file_end(self):
        return self._split_file_end

    @property
    def schema_file_start(self):
        return self._schema_file_start

    @property
    def schema_file_end(self):
        return self._schema_file_end

    @property
    def dtypes(self):
        return self._dtypes

    @classmethod
    def execute(cls, ctx, op):
        import numpy as np
        from cupid.io.table import TableSplit

        tsp = TableSplit(
            _handle=op.cupid_handle,
            _split_index=op.split_index,
            _split_file_start=op.split_file_start,
            _split_file_end=op.split_file_end,
            _schema_file_start=op.schema_file_start,
            _schema_file_end=op.schema_file_end,
        )
        logger.debug('Read split table, split index: %s', op.split_index)
        reader = tsp.open_arrow_reader()
        data = reader.read_all().to_pandas()
        logger.debug('Read split table finished, split index: %s',
                     op.split_index)
        count = np.array([data.shape[0]])
        data_chunk, count_chunk = op.outputs
        ctx[data_chunk.key] = data
        ctx[count_chunk.key] = count

Example #14

Show file

class DataFrameWriteTableCommit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123462

    _dtypes = SeriesField('dtypes')

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _overwrite = BoolField('overwrite')
    _blocks = DictField('blocks')
    _cupid_handle = StringField('cupid_handle')
    _is_terminal = BoolField('is_terminal')

    def __init__(self,
                 dtypes=None,
                 odps_params=None,
                 table_name=None,
                 blocks=None,
                 cupid_handle=None,
                 overwrite=False,
                 is_terminal=None,
                 **kw):
        super(DataFrameWriteTableCommit,
              self).__init__(_dtypes=dtypes,
                             _odps_params=odps_params,
                             _table_name=table_name,
                             _blocks=blocks,
                             _overwrite=overwrite,
                             _cupid_handle=cupid_handle,
                             _is_terminal=is_terminal,
                             _object_type=ObjectType.dataframe,
                             **kw)

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def table_name(self):
        return self._table_name

    @property
    def blocks(self):
        return self._blocks

    @property
    def overwrite(self):
        return self._overwrite

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def is_terminal(self):
        return self._is_terminal

    @classmethod
    def execute(cls, ctx, op):
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.io.table import CupidTableUploadSession

        if op.is_terminal:
            bearer_token = context().get_bearer_token()
            account = BearerTokenAccount(bearer_token)
            o = ODPS(None, None, account=account, **op.odps_params)
            cupid_session = CupidSession(o)

            project_name, table_name = op.table_name.split('.')
            upload_session = CupidTableUploadSession(session=cupid_session,
                                                     table_name=table_name,
                                                     project_name=project_name,
                                                     handle=op.cupid_handle,
                                                     blocks=op.blocks)
            upload_session.commit(overwrite=op.overwrite)

        ctx[op.outputs[0].key] = pd.DataFrame()

Example #15

Show file

class TensorStoreCOO(TensorDataStore):
    _op_type_ = OperandDef.STORE_COO

    _input = KeyField('input')
    _path = StringField('path')
    _dim_cols = ListField('dim_cols', ValueType.string)
    _value_col = StringField('value_col')
    _storage_options = StringField('storage_options')
    _global_index = BoolField('global_index', default=False)
    _axis_offsets = TupleField('axis_offsets')

    def __init__(self, dtype=None, path=None, dim_cols=None, value_col=None,
                 storage_options=None, sparse=True, global_index=False, **kw):
        super(TensorStoreCOO, self).__init__(_path=path, _dim_cols=dim_cols, _value_col=value_col,
                                             _dtype=dtype, _storage_options=storage_options,
                                             _global_index=global_index, _sparse=sparse, **kw)

    @property
    def input(self):
        return self._input

    @property
    def path(self):
        return self._path

    @property
    def dim_cols(self):
        return self._dim_cols

    @property
    def value_col(self):
        return self._value_col

    @property
    def storage_options(self):
        return self._storage_options

    @property
    def global_index(self):
        return self._global_index

    @property
    def axis_offsets(self):
        return self._axis_offsets

    def _set_inputs(self, inputs):
        super(TensorStoreCOO, self)._set_inputs(inputs)
        self._input = self._inputs[0]

    def calc_shape(self, *inputs_shape):
        return (0,) * len(inputs_shape[0])

    @classmethod
    def tile(cls, op):
        in_tensor = op.input

        out_chunks = []
        out_chunk_shape = (0,) * in_tensor.ndim
        axis_offsets = [[0] + np.cumsum(ns)[:-1].tolist() for ns in in_tensor.nsplits]
        for chunk in in_tensor.chunks:
            chunk_op = op.copy().reset_key()
            chunk_path = '%s/%s.parquet' % (
                chunk_op.path, ','.join(str(j) for j in chunk.index))
            chunk_op._path = chunk_path
            chunk_op._axis_offsets = \
                tuple(axis_offsets[axis][idx] for axis, idx in enumerate(chunk.index))
            out_chunk = chunk_op.new_chunk([chunk], shape=out_chunk_shape, index=chunk.index)
            out_chunks.append(out_chunk)

        new_op = op.copy()
        return new_op.new_tensors(op.inputs, op.outputs[0].shape,
                                  chunks=out_chunks,
                                  nsplits=((0,) * len(ns) for ns in in_tensor.nsplits))

    @classmethod
    def execute(cls, ctx, op):
        import numpy as np
        import pandas as pd
        import pyarrow as pa
        import pyarrow.parquet as pq
        from ..io import open as fs_open

        to_store_data = ctx[op.inputs[0].key]
        storage_opts = json.loads(op.storage_options)
        axis_offsets = op.axis_offsets
        store_global_index = op.global_index
        dim_cols = op.dim_cols
        col_to_array = {}

        if isinstance(to_store_data, SparseNDArray):
            # sparse, convert to coo matrix
            matrix = to_store_data.raw.tocoo(copy=False)
            ndim = matrix.ndim

            if len(dim_cols) > 1:
                col_to_array[dim_cols[0]] = matrix.row
                if store_global_index:
                    # global index
                    col_to_array['global_' + dim_cols[0]] = matrix.row + axis_offsets[0]
                col_to_array[dim_cols[1]] = matrix.col
                if store_global_index:
                    col_to_array['global_' + dim_cols[1]] = matrix.col + axis_offsets[1]
            else:
                col_to_array[dim_cols[0]] = matrix.col
                if store_global_index:
                    col_to_array['global_' + dim_cols[0]] = matrix.col + axis_offsets[0]

            col_to_array[op.value_col] = matrix.data
        else:
            # dense, convert to numpy array
            arr = as_np_array(to_store_data)
            ndim = arr.ndim

            index = np.array(np.meshgrid(*[np.arange(s) for s in arr.shape])).T.reshape(-1, arr.ndim).T
            for j, col, ind in zip(range(len(dim_cols)), dim_cols, index):
                col_to_array[col] = ind
                if store_global_index:
                    col_to_array['global_' + col] = ind + axis_offsets[j]
            col_to_array[op.value_col] = arr.ravel()

        df = pd.DataFrame(col_to_array)
        if len(op.dim_cols) > ndim:
            for col in op.dim_cols[ndim:]:
                df[col] = None
        table = pa.Table.from_pandas(df)
        bio = BytesIO()
        pq.write_table(table, bio)
        bio.seek(0)

        # write oss
        with fs_open(op.path, 'wb', **storage_opts) as out_file:
            out_file.write(bio.read())

        ctx[op.outputs[0].key] = np.empty((0,) * to_store_data.ndim)

Example #16

Show file

class DataFrameWriteTable(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123460

    _dtypes = SeriesField('dtypes')

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _overwrite = BoolField('overwrite')
    _write_batch_size = Int64Field('write_batch_size')

    def __init__(self,
                 dtypes=None,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 over_write=None,
                 write_batch_size=None,
                 **kw):
        super(DataFrameWriteTable,
              self).__init__(_dtypes=dtypes,
                             _odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _overwrite=over_write,
                             _write_batch_size=write_batch_size,
                             _object_type=ObjectType.dataframe,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def overwrite(self):
        return self._overwrite

    @property
    def write_batch_size(self):
        return self._write_batch_size

    def __call__(self, x):
        shape = (0, ) * len(x.shape)
        return self.new_dataframe([x], shape=shape)

    @classmethod
    def tile(cls, op):
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from mars.dataframe.utils import build_concatenated_rows_frame

        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        o = ODPS(None, None, account=account, **op.odps_params)
        cupid_session = CupidSession(o)

        data_src = o.get_table(op.table_name)

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        input_df = build_concatenated_rows_frame(op.inputs[0])

        out_chunks = []
        out_chunk_shape = (0, ) * len(input_df.shape)
        blocks = {}
        for chunk in input_df.chunks:
            block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace(
                '-', '')
            chunk_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes,
                table_name=op.table_name,
                partition_spec=op.partition_spec,
                cupid_handle=to_str(upload_session.handle),
                block_id=block_id,
                write_batch_size=op.write_batch_size)
            out_chunk = chunk_op.new_chunk([chunk],
                                           shape=out_chunk_shape,
                                           index=chunk.index,
                                           dtypes=chunk.dtypes)
            out_chunks.append(out_chunk)
            blocks[block_id] = op.partition_spec

        # build commit tree
        combine_size = 8
        chunks = out_chunks
        while len(chunks) > combine_size:
            new_chunks = []
            for i in range(0, len(chunks), combine_size):
                chks = chunks[i:i + combine_size]
                if len(chks) == 1:
                    chk = chks[0]
                else:
                    chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                       is_terminal=False)
                    chk = chk_op.new_chunk(chks,
                                           shape=out_chunk_shape,
                                           dtypes=op.dtypes)
                new_chunks.append(chk)
            chunks = new_chunks

        assert len(chunks) < combine_size

        commit_table_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                    table_name=op.table_name,
                                                    blocks=blocks,
                                                    cupid_handle=to_str(
                                                        upload_session.handle),
                                                    overwrite=op.overwrite,
                                                    odps_params=op.odps_params,
                                                    is_terminal=True)
        commit_table_chunk = commit_table_op.new_chunk(chunks,
                                                       shape=out_chunk_shape,
                                                       dtypes=op.dtypes)

        out_df = op.outputs[0]
        new_op = op.copy()
        return new_op.new_dataframes(op.inputs,
                                     shape=out_df.shape,
                                     dtypes=out_df.dtypes,
                                     chunks=[commit_table_chunk],
                                     nsplits=((0, ), ) * len(out_chunk_shape))

Example #17

Show file

File: datasource.py Project: wjsi/aliyun-odps-python-sdk

class DataFrameReadTableSplit(_Base):
    _op_type_ = 123451

    # for cupid
    _cupid_handle = StringField('cupid_handle')
    _split_index = Int64Field('split_index')
    _split_file_start = Int64Field('split_file_start')
    _split_file_end = Int64Field('split_file_end')
    _schema_file_start = Int64Field('schema_file_start')
    _schema_file_end = Int64Field('schema_file_end')
    _use_arrow_dtype = BoolField('use_arrow_dtype')
    _string_as_binary = BoolField('string_as_binary')
    _dtypes = SeriesField('dtypes')
    _nrows = Int64Field('nrows')

    # for tunnel
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _start_index = Int64Field('start_index')
    _end_index = Int64Field('end_index')
    _odps_params = DictField('odps_params')
    _columns = ListField('columns')

    _split_size = Int64Field('split_size')
    _append_partitions = BoolField('append_partitions')
    _estimate_rows = Int64Field('estimate_rows')
    _meta_raw_size = Int64Field('meta_raw_size')

    def __init__(self,
                 cupid_handle=None,
                 split_index=None,
                 split_file_start=None,
                 split_file_end=None,
                 schema_file_start=None,
                 schema_file_end=None,
                 table_name=None,
                 partition_spec=None,
                 start_index=None,
                 end_index=None,
                 odps_params=None,
                 columns=None,
                 nrows=None,
                 dtypes=None,
                 string_as_binary=None,
                 split_size=None,
                 use_arrow_dtype=None,
                 memory_scale=None,
                 estimate_rows=None,
                 meta_raw_size=None,
                 append_partitions=None,
                 sparse=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameReadTableSplit,
              self).__init__(_cupid_handle=cupid_handle,
                             _split_index=split_index,
                             _split_file_start=split_file_start,
                             _split_file_end=split_file_end,
                             _schema_file_start=schema_file_start,
                             _schema_file_end=schema_file_end,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _columns=columns,
                             _start_index=start_index,
                             _end_index=end_index,
                             _odps_params=odps_params,
                             _use_arrow_dtype=use_arrow_dtype,
                             _string_as_binary=string_as_binary,
                             _nrows=nrows,
                             _estimate_rows=estimate_rows,
                             _split_size=split_size,
                             _dtypes=dtypes,
                             _append_partitions=append_partitions,
                             _sparse=sparse,
                             _meta_raw_size=meta_raw_size,
                             _memory_scale=memory_scale,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def output_limit(self):
        return 1

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def split_index(self):
        return self._split_index

    @property
    def split_file_start(self):
        return self._split_file_start

    @property
    def split_file_end(self):
        return self._split_file_end

    @property
    def schema_file_start(self):
        return self._schema_file_start

    @property
    def schema_file_end(self):
        return self._schema_file_end

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def start_index(self):
        return self._start_index

    @property
    def end_index(self):
        return self._end_index

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def columns(self):
        return self._columns

    @property
    def nrows(self):
        return self._nrows

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def split_size(self):
        return self._split_size

    @property
    def estimate_rows(self):
        return self._estimate_rows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @property
    def string_as_binary(self):
        return self._string_as_binary

    @property
    def append_partitions(self):
        return self._append_partitions

    @property
    def meta_raw_size(self):
        return self._meta_raw_size

    @classmethod
    def estimate_size(cls, ctx, op):
        import numpy as np

        def is_object_dtype(dtype):
            try:
                return np.issubdtype(dtype, np.object_) \
                       or np.issubdtype(dtype, np.unicode_) \
                       or np.issubdtype(dtype, np.bytes_)
            except TypeError:  # pragma: no cover
                return False

        if op.split_size is None:
            ctx[op.outputs[0].key] = (0, 0)
            return

        arrow_size = (op.memory_scale or ORC_COMPRESSION_RATIO) * op.split_size
        if op.meta_raw_size is not None:
            raw_arrow_size = (op.memory_scale or 1) * op.meta_raw_size
            arrow_size = max(arrow_size, raw_arrow_size)

        n_strings = len([dt for dt in op.dtypes if is_object_dtype(dt)])
        if op.estimate_rows or op.nrows:
            rows = op.nrows if op.nrows is not None else op.estimate_rows
            pd_size = arrow_size + n_strings * rows * STRING_FIELD_OVERHEAD
            logger.debug('Estimate pandas memory cost: %r', pd_size)
        else:
            pd_size = arrow_size * 10 if n_strings else arrow_size

        ctx[op.outputs[0].key] = (pd_size, pd_size + arrow_size)

    @classmethod
    def _cast_string_to_binary(cls, arrow_table):
        import pyarrow as pa

        new_schema = []
        for field in arrow_table.schema:
            if field.type == pa.string():
                new_schema.append(pa.field(field.name, pa.binary()))
            else:
                new_schema.append(field)

        return arrow_table.cast(pa.schema(new_schema))

    @classmethod
    def _append_partition_values(cls, arrow_table, op):
        import pyarrow as pa

        if op.append_partitions and op.partition_spec:
            from odps.types import PartitionSpec
            spec = PartitionSpec(op.partition_spec)

            for col_name, pt_val in spec.items():
                arrow_table = arrow_table.append_column(
                    col_name,
                    pa.array([pt_val] * arrow_table.num_rows, pa.string()))

        return arrow_table

    @staticmethod
    def _align_columns(data, expected_dtypes):
        data_columns = data.dtypes.index
        expected_columns = expected_dtypes.index
        if not data_columns.equals(expected_columns):
            logger.debug(
                "Data columns differs from output columns, "
                "data columns: %s, output columns: %s", data_columns,
                expected_columns)
            data.columns = expected_columns[:len(data.columns)]
            for extra_col in expected_columns[len(data.columns):]:
                data[extra_col] = pd.Series([],
                                            dtype=expected_dtypes[extra_col])
            if not data.dtypes.index.equals(expected_columns):
                data = data[expected_columns]
        return data

    @classmethod
    def _execute_in_cupid(cls, ctx, op):
        import pyarrow as pa
        from cupid.io.table import TableSplit

        out = op.outputs[0]

        if op.cupid_handle is None:
            empty_df = pd.DataFrame()
            for name, dtype in out.dtypes.items():
                empty_df[name] = pd.Series(dtype=dtype)
            ctx[out.key] = empty_df
            return

        tsp = TableSplit(
            _handle=op.cupid_handle,
            _split_index=op.split_index,
            _split_file_start=op.split_file_start,
            _split_file_end=op.split_file_end,
            _schema_file_start=op.schema_file_start,
            _schema_file_end=op.schema_file_end,
        )
        logger.debug('Read split table, split index: %s', op.split_index)
        reader = tsp.open_arrow_reader()
        if op.nrows is None:
            arrow_table = reader.read_all()
        else:
            nrows = 0
            batches = []
            while nrows < op.nrows:
                try:
                    batch = reader.read_next_batch()
                    nrows += batch.num_rows
                    batches.append(batch)
                except StopIteration:
                    break
            logger.debug('Read %s rows of this split.', op.nrows)
            arrow_table = pa.Table.from_batches(batches)

        arrow_table = cls._append_partition_values(arrow_table, op)

        if op.string_as_binary:
            arrow_table = cls._cast_string_to_binary(arrow_table)
        data = arrow_table_to_pandas_dataframe(
            arrow_table, use_arrow_dtype=op.use_arrow_dtype)
        if op.nrows is not None:
            data = data[:op.nrows]

        data = cls._align_columns(data, out.dtypes)

        logger.debug('Read split table finished, split index: %s',
                     op.split_index)
        logger.debug('Split data shape is %s, size is %s', data.shape,
                     data.memory_usage(deep=True).sum())
        ctx[out.key] = data

    @classmethod
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)

        if op.partition_spec is not None:
            download_session = tunnel.create_download_session(
                t.name, partition_spec=op.partition_spec)
        else:
            download_session = tunnel.create_download_session(t.name)
        logger.debug('Start reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        if op.nrows is None:
            count = op.end_index - op.start_index
        else:
            count = op.nrows

        with download_session.open_arrow_reader(op.start_index,
                                                count,
                                                columns=op.columns) as reader:
            table = reader.read()

        table = cls._append_partition_values(table, op)
        if op.string_as_binary:
            table = cls._cast_string_to_binary(table)
        data = arrow_table_to_pandas_dataframe(
            table, use_arrow_dtype=op.use_arrow_dtype)

        data = cls._align_columns(data, op.outputs[0].dtypes)

        logger.debug('Finish reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        ctx[op.outputs[0].key] = data

    @classmethod
    def execute(cls, ctx, op):
        from cupid.runtime import RuntimeContext

        if RuntimeContext.is_context_ready():
            cls._execute_in_cupid(ctx, op)
        else:
            cls._execute_arrow_tunnel(ctx, op)