Example #1
class DataFrameWriteTable(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123460

    _dtypes = SeriesField('dtypes')

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _overwrite = BoolField('overwrite')
    _write_batch_size = Int64Field('write_batch_size')
    _unknown_as_string = BoolField('unknown_as_string')

    def __init__(self,
                 dtypes=None,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 unknown_as_string=None,
                 over_write=None,
                 write_batch_size=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameWriteTable,
              self).__init__(_dtypes=dtypes,
                             _odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _unknown_as_string=unknown_as_string,
                             _overwrite=over_write,
                             _write_batch_size=write_batch_size,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def unknown_as_string(self):
        return self._unknown_as_string

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def overwrite(self):
        return self._overwrite

    @property
    def write_batch_size(self):
        return self._write_batch_size

    def __call__(self, x):
        shape = (0, ) * len(x.shape)
        index_value = parse_index(x.index_value.to_pandas()[:0], x.key,
                                  'index')
        columns_value = parse_index(x.columns_value.to_pandas()[:0],
                                    x.key,
                                    'columns',
                                    store_data=True)
        return self.new_dataframe([x],
                                  shape=shape,
                                  dtypes=x.dtypes[:0],
                                  index_value=index_value,
                                  columns_value=columns_value)

    @classmethod
    def _tile_cupid(cls, op):
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )
        cupid_ctx = context()

        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=odps_params['project'],
                 endpoint=endpoint)
        cupid_session = CupidSession(o)

        data_src = o.get_table(op.table_name)

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        input_df = build_concatenated_rows_frame(op.inputs[0])
        out_df = op.outputs[0]

        out_chunks = []
        out_chunk_shape = (0, ) * len(input_df.shape)
        blocks = {}
        for chunk in input_df.chunks:
            block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace(
                '-', '')
            chunk_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes,
                table_name=op.table_name,
                unknown_as_string=op.unknown_as_string,
                partition_spec=op.partition_spec,
                cupid_handle=to_str(upload_session.handle),
                block_id=block_id,
                write_batch_size=op.write_batch_size)
            out_chunk = chunk_op.new_chunk([chunk],
                                           shape=out_chunk_shape,
                                           index=chunk.index,
                                           index_value=out_df.index_value,
                                           dtypes=chunk.dtypes)
            out_chunks.append(out_chunk)
            blocks[block_id] = op.partition_spec

        # build commit tree
        combine_size = 8
        chunks = out_chunks
        while len(chunks) >= combine_size:
            new_chunks = []
            for i in range(0, len(chunks), combine_size):
                chks = chunks[i:i + combine_size]
                if len(chks) == 1:
                    chk = chks[0]
                else:
                    chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                       is_terminal=False)
                    chk = chk_op.new_chunk(chks,
                                           shape=out_chunk_shape,
                                           index_value=out_df.index_value,
                                           dtypes=op.dtypes)
                new_chunks.append(chk)
            chunks = new_chunks

        assert len(chunks) < combine_size

        commit_table_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                    table_name=op.table_name,
                                                    blocks=blocks,
                                                    cupid_handle=to_str(
                                                        upload_session.handle),
                                                    overwrite=op.overwrite,
                                                    odps_params=op.odps_params,
                                                    is_terminal=True)
        commit_table_chunk = commit_table_op.new_chunk(
            chunks,
            shape=out_chunk_shape,
            dtypes=op.dtypes,
            index_value=out_df.index_value)

        new_op = op.copy()
        return new_op.new_dataframes(op.inputs,
                                     shape=out_df.shape,
                                     index_value=out_df.index_value,
                                     dtypes=out_df.dtypes,
                                     columns_value=out_df.columns_value,
                                     chunks=[commit_table_chunk],
                                     nsplits=((0, ), ) * len(out_chunk_shape))

    @classmethod
    def _tile_tunnel(cls, op):
        out_df = op.outputs[0]
        in_df = build_concatenated_rows_frame(op.inputs[0])

        out_chunks = []
        for chunk in in_df.chunks:
            chunk_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes,
                table_name=op.table_name,
                odps_params=op.odps_params,
                partition_spec=op.partition_spec)
            index_value = parse_index(chunk.index_value.to_pandas()[:0], chunk)
            out_chunk = chunk_op.new_chunk([chunk],
                                           shape=(0, 0),
                                           index_value=index_value,
                                           columns_value=out_df.columns_value,
                                           dtypes=out_df.dtypes,
                                           index=chunk.index)
            out_chunks.append(out_chunk)

        new_op = op.copy()
        params = out_df.params.copy()
        params.update(
            dict(chunks=out_chunks,
                 nsplits=((0, ) * in_df.chunk_shape[0], (0, ))))
        return new_op.new_tileables([in_df], **params)

    @classmethod
    def tile(cls, op):
        from cupid.runtime import RuntimeContext

        if RuntimeContext.is_context_ready():
            return cls._tile_cupid(op)
        else:
            return cls._tile_tunnel(op)
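
A hedged usage sketch for the operand above: a write-style wrapper built only from the constructor and __call__ shown here. The wrapper name and defaults are hypothetical, not pyodps' public API, and it assumes the surrounding Mars/pyodps module is importable.

def write_odps_table_sketch(df, table_name, odps_params, partition=None,
                            overwrite=False, unknown_as_string=False,
                            batch_size=None):
    # df is a Mars DataFrame; the operand records everything needed to tile
    # the write into per-chunk uploads plus a final commit chunk.
    op = DataFrameWriteTable(dtypes=df.dtypes,
                             odps_params=odps_params,
                             table_name=table_name,
                             partition_spec=partition,
                             unknown_as_string=unknown_as_string,
                             over_write=overwrite,
                             write_batch_size=batch_size)
    # __call__ returns an empty-shaped DataFrame whose execution performs the
    # write as a side effect.
    return op(df)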
Example #2
class DataFrameWriteTableSplit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123461

    _dtypes = SeriesField('dtypes')

    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _cupid_handle = StringField('cupid_handle')
    _block_id = StringField('block_id')
    _write_batch_size = Int64Field('write_batch_size')

    def __init__(self,
                 dtypes=None,
                 table_name=None,
                 partition_spec=None,
                 cupid_handle=None,
                 block_id=None,
                 write_batch_size=None,
                 **kw):
        super(DataFrameWriteTableSplit,
              self).__init__(_dtypes=dtypes,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _cupid_handle=cupid_handle,
                             _block_id=block_id,
                             _write_batch_size=write_batch_size,
                             _object_type=ObjectType.dataframe,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def block_id(self):
        return self._block_id

    @property
    def write_batch_size(self):
        return self._write_batch_size

    @classmethod
    def execute(cls, ctx, op):
        import pyarrow as pa
        import pandas as pd
        from ...df.backends.pd.types import pd_to_df_schema
        from cupid.io.table.core import BlockWriter

        to_store_data = ctx[op.inputs[0].key]

        odps_schema = pd_to_df_schema(to_store_data, unknown_as_string=True)
        project_name, table_name = op.table_name.split('.')
        block_writer = BlockWriter(_table_name=table_name,
                                   _project_name=project_name,
                                   _table_schema=odps_schema,
                                   _partition_spec=op.partition_spec,
                                   _block_id=op.block_id,
                                   _handle=op.cupid_handle)
        logger.debug('Start writing table block, block id: %s', op.block_id)
        with block_writer.open_arrow_writer() as cupid_writer:

            sink = pa.BufferOutputStream()

            batch_size = op.write_batch_size or 1024
            schema = pa.RecordBatch.from_pandas(to_store_data[:1],
                                                preserve_index=False).schema
            arrow_writer = pa.RecordBatchStreamWriter(sink, schema)
            batch_idx = 0
            batch_data = to_store_data[batch_size * batch_idx:batch_size *
                                       (batch_idx + 1)]
            while len(batch_data) > 0:
                batch = pa.RecordBatch.from_pandas(batch_data,
                                                   preserve_index=False)
                arrow_writer.write_batch(batch)
                batch_idx += 1
                batch_data = to_store_data[batch_size * batch_idx:batch_size *
                                           (batch_idx + 1)]
            arrow_writer.close()
            cupid_writer.write(sink.getvalue())
        logger.debug('Write table block finished, block id: %s', op.block_id)

        block_writer.commit()
        ctx[op.outputs[0].key] = pd.DataFrame()
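
The execute method above streams the chunk into the Cupid block writer as fixed-size Arrow record batches. That batching pattern can be reproduced standalone with just pandas and pyarrow; this minimal sketch omits the BlockWriter and the commit step, and the helper name is ours.

import pandas as pd
import pyarrow as pa

def dataframe_to_arrow_stream(df, batch_size=1024):
    # Serialize a pandas DataFrame into an Arrow IPC stream buffer in
    # fixed-size positional slices, mirroring the loop above.
    sink = pa.BufferOutputStream()
    schema = pa.RecordBatch.from_pandas(df[:1], preserve_index=False).schema
    writer = pa.RecordBatchStreamWriter(sink, schema)
    batch_idx = 0
    batch = df[batch_size * batch_idx:batch_size * (batch_idx + 1)]
    while len(batch) > 0:
        writer.write_batch(pa.RecordBatch.from_pandas(batch, preserve_index=False))
        batch_idx += 1
        batch = df[batch_size * batch_idx:batch_size * (batch_idx + 1)]
    writer.close()
    return sink.getvalue()

buf = dataframe_to_arrow_stream(pd.DataFrame({'a': range(3000), 'b': 1.0}))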
Example #3
class TensorTableCOO(TensorNoInput):
    _op_type_ = OperandDef.TABLE_COO

    _paths = ListField('paths', ValueType.string)
    _dim_cols = ListField('dim_cols', ValueType.string)
    _value_col = StringField('value_col')
    _storage_options = StringField('storage_options')

    def __init__(self, dtype=None, paths=None, dim_cols=None, value_col=None,
                 storage_options=None, sparse=True, **kw):
        super(TensorTableCOO, self).__init__(_paths=paths, _dim_cols=dim_cols, _value_col=value_col,
                                             _dtype=dtype, _storage_options=storage_options,
                                             _sparse=sparse, **kw)

    @property
    def paths(self):
        return self._paths

    @property
    def dim_cols(self):
        return self._dim_cols

    @property
    def value_col(self):
        return self._value_col

    @property
    def storage_options(self):
        return self._storage_options

    @classmethod
    def tile(cls, op):
        tensor = op.outputs[0]

        storage_opts = json.loads(op.storage_options)

        logger.debug('Start scanning data files in %s', op.paths[0])
        chunk_files = dict()
        for key in glob(op.paths[0], **storage_opts):
            file_name, _ = key.rsplit('.', 1)
            _, fn_suffix = file_name.rsplit('/', 1)
            dim_suffix = fn_suffix.rsplit('@', 1)[-1]
            dim_indices = tuple(int(pt) for pt in dim_suffix.split(','))
            if dim_indices not in chunk_files:
                chunk_files[dim_indices] = []
            chunk_files[dim_indices].append(key)
        logger.debug('Finish scanning data files in %s', op.paths[0])

        try:
            target_chunk_size = tensor.params.raw_chunk_size
        except AttributeError:
            target_chunk_size = tensor.extra_params.raw_chunk_size
        chunk_size = decide_chunk_sizes(tensor.shape, target_chunk_size, tensor.dtype.itemsize)
        chunk_size_idxes = (range(len(size)) for size in chunk_size)

        out_chunks = []
        for chunk_shape, chunk_idx in izip(itertools.product(*chunk_size),
                                           itertools.product(*chunk_size_idxes)):
            chunk_op = op.copy().reset_key()
            chunk_op._paths = chunk_files.get(chunk_idx, [])
            out_chunk = chunk_op.new_chunk(None, shape=chunk_shape, index=chunk_idx)
            out_chunks.append(out_chunk)

        new_op = op.copy()
        return new_op.new_tensors(op.inputs, tensor.shape,
                                  nsplits=chunk_size, chunks=out_chunks)

    @classmethod
    def execute(cls, ctx, op):
        import pyarrow.parquet as pq
        import pandas as pd
        import scipy.sparse as sps
        from mars.lib.sparse import SparseNDArray
        from ..io import open as fs_open

        dfs = []
        storage_opts = json.loads(op.storage_options)
        for p in op.paths:
            with fs_open(p, 'rb', **storage_opts) as inp_file:
                f = inp_file.read()
                dfs.append(pq.read_table(BytesIO(f)).to_pandas())

        chunk = op.outputs[0]
        if op.sparse and len(dfs) == 0:
            if len(chunk.shape) == 1:
                csr_array = sps.csr_matrix((chunk.shape[0], 1))
                ctx[chunk.key] = SparseNDArray(csr_array, shape=chunk.shape)
            else:
                csr_array = sps.csr_matrix(chunk.shape)
                ctx[chunk.key] = SparseNDArray(csr_array)
            return

        df_merged = pd.concat(dfs, ignore_index=True)
        dim_arrays = [df_merged[col] for col in op.dim_cols]
        value_array = df_merged[op.value_col].astype(chunk.dtype)
        del df_merged

        if op.sparse:
            if len(chunk.shape) == 1:
                dim_arrays.append(np.zeros((len(dim_arrays[0]))))
                csr_array = sps.csr_matrix((value_array, tuple(dim_arrays)), shape=(chunk.shape[0], 1))
            else:
                csr_array = sps.csr_matrix((value_array, tuple(dim_arrays)), shape=chunk.shape)
            del dim_arrays, value_array
            ctx[chunk.key] = SparseNDArray(csr_array, shape=chunk.shape)
        else:
            arr = np.empty(chunk.shape, dtype=value_array.dtype)
            arr[tuple(dim_arrays)] = value_array
            ctx[chunk.key] = arr
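
TensorTableCOO.execute rebuilds each chunk from COO-style parquet columns: the dim_cols give the indices and value_col the data. The sparse branch reduces to the scipy pattern below; this is a standalone sketch with made-up column names, not the operand's actual file I/O.

import numpy as np
import pandas as pd
import scipy.sparse as sps

# Dimension columns -> (row, col) indices, value column -> data array.
df = pd.DataFrame({'i': [0, 1, 3], 'j': [2, 0, 1], 'v': [1.0, 2.0, 3.0]})
csr = sps.csr_matrix((df['v'], (df['i'], df['j'])), shape=(4, 3))

# The dense branch instead scatters the values into a preallocated ndarray
# (zeros here for a clean demo; the original uses np.empty).
arr = np.zeros((4, 3), dtype=df['v'].dtype)
arr[df['i'], df['j']] = df['v']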
Example #4
class DataFrameWriteTableSplit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123461

    _dtypes = SeriesField('dtypes')

    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _cupid_handle = StringField('cupid_handle')
    _block_id = StringField('block_id')
    _write_batch_size = Int64Field('write_batch_size')
    _unknown_as_string = BoolField('unknown_as_string')

    # for tunnel
    _odps_params = DictField('odps_params')

    def __init__(self,
                 dtypes=None,
                 table_name=None,
                 odps_params=None,
                 partition_spec=None,
                 cupid_handle=None,
                 unknown_as_string=None,
                 block_id=None,
                 write_batch_size=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameWriteTableSplit,
              self).__init__(_dtypes=dtypes,
                             _table_name=table_name,
                             _odps_params=odps_params,
                             _partition_spec=partition_spec,
                             _unknown_as_string=unknown_as_string,
                             _cupid_handle=cupid_handle,
                             _block_id=block_id,
                             _write_batch_size=write_batch_size,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def table_name(self):
        return self._table_name

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def unknown_as_string(self):
        return self._unknown_as_string

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def block_id(self):
        return self._block_id

    @property
    def write_batch_size(self):
        return self._write_batch_size

    @classmethod
    def _execute_in_cupid(cls, ctx, op):
        import pyarrow as pa
        import pandas as pd
        from ....df.backends.pd.types import pd_to_df_schema
        from cupid.io.table.core import BlockWriter

        to_store_data = ctx[op.inputs[0].key]

        odps_schema = pd_to_df_schema(to_store_data,
                                      unknown_as_string=op.unknown_as_string)
        project_name, table_name = op.table_name.split('.')
        block_writer = BlockWriter(_table_name=table_name,
                                   _project_name=project_name,
                                   _table_schema=odps_schema,
                                   _partition_spec=op.partition_spec,
                                   _block_id=op.block_id,
                                   _handle=op.cupid_handle)
        logger.debug('Start writing table block, block id: %s', op.block_id)
        with block_writer.open_arrow_writer() as cupid_writer:

            sink = pa.BufferOutputStream()

            batch_size = op.write_batch_size or 1024
            batch_idx = 0
            batch_data = to_store_data[batch_size * batch_idx:batch_size *
                                       (batch_idx + 1)]
            batch_data = convert_pandas_object_to_string(batch_data)
            schema = pa.RecordBatch.from_pandas(to_store_data[:1],
                                                preserve_index=False).schema
            arrow_writer = pa.RecordBatchStreamWriter(sink, schema)
            while len(batch_data) > 0:
                batch = pa.RecordBatch.from_pandas(batch_data,
                                                   preserve_index=False)
                arrow_writer.write_batch(batch)
                batch_idx += 1
                batch_data = to_store_data[batch_size * batch_idx:batch_size *
                                           (batch_idx + 1)]
                # keep later batches consistent with the first, already
                # converted batch
                batch_data = convert_pandas_object_to_string(batch_data)
            arrow_writer.close()
            cupid_writer.write(sink.getvalue())
        logger.debug('Write table block finished, block id: %s', op.block_id)

        block_writer.commit()
        ctx[op.outputs[0].key] = pd.DataFrame()

    @classmethod
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel
        import pyarrow as pa
        import pandas as pd

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)

        if op.partition_spec is not None:
            upload_session = tunnel.create_upload_session(
                t.name, partition_spec=op.partition_spec)
        else:
            upload_session = tunnel.create_upload_session(t.name)

        logger.debug('Start writing table %s split index: %s', op.table_name,
                     op.inputs[0].index)
        writer = upload_session.open_arrow_writer(0)
        arrow_rb = pa.RecordBatch.from_pandas(ctx[op.inputs[0].key])
        writer.write(arrow_rb)
        writer.close()
        upload_session.commit([0])
        logger.debug('Finish writing table %s split index: %s', op.table_name,
                     op.inputs[0].index)
        ctx[op.outputs[0].key] = pd.DataFrame()

    @classmethod
    def execute(cls, ctx, op):
        if op.cupid_handle is not None:
            cls._execute_in_cupid(ctx, op)
        else:
            cls._execute_arrow_tunnel(ctx, op)
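
Both execution paths above resolve their ODPS connection the same way: environment variables injected inside the cluster take precedence over the odps_params serialized on the client. A minimal standalone sketch of that resolution (the helper name is ours, not the module's; the values are placeholders):

import os

def resolve_odps_params(odps_params):
    # ODPS_PROJECT_NAME / ODPS_RUNTIME_ENDPOINT, when set by the cluster,
    # override whatever the client put into odps_params.
    params = odps_params.copy()
    project = os.environ.get('ODPS_PROJECT_NAME', None)
    if project:
        params['project'] = project
    params['endpoint'] = (os.environ.get('ODPS_RUNTIME_ENDPOINT')
                          or params['endpoint'])
    return params

params = resolve_odps_params({'project': 'client_project',
                              'endpoint': 'http://service.example/api'})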
Example #5
    class RunScript(LearnMergeDictOperand):
        _op_type_ = 743210

        _code = BytesField('code')
        _mode = StringField('mode')
        _command_args = ListField('command_args')
        _world_size = Int32Field('world_size')
        _rank = Int32Field('rank')

        def __init__(self,
                     code=None,
                     mode=None,
                     command_args=None,
                     world_size=None,
                     rank=None,
                     merge=None,
                     output_types=None,
                     **kw):
            super().__init__(_code=code,
                             _mode=mode,
                             _command_args=command_args,
                             _world_size=world_size,
                             _rank=rank,
                             _merge=merge,
                             _output_types=output_types,
                             **kw)
            if self._output_types is None:
                self._output_types = [OutputType.object]

        @property
        def code(self):
            return self._code

        @property
        def mode(self):
            return self._mode

        @property
        def world_size(self):
            return self._world_size

        @property
        def rank(self):
            return self._rank

        @property
        def command_args(self):
            return self._command_args or []

        def __call__(self):
            return self.new_tileable(None)

        @classmethod
        def tile(cls, op):
            out_chunks = []
            for i in range(op.world_size):
                chunk_op = op.copy().reset_key()
                chunk_op._rank = i
                out_chunks.append(chunk_op.new_chunk(None, index=(i, )))

            new_op = op.copy()
            return new_op.new_tileables(
                op.inputs,
                chunks=out_chunks,
                nsplits=(tuple(np.nan for _ in range(len(out_chunks))), ))

        @classmethod
        def _execute_with_subprocess(cls, op, env=None):
            # write source code into a temp file
            fd, filename = tempfile.mkstemp('.py')
            with os.fdopen(fd, 'wb') as f:
                f.write(op.code)
            logger.debug('Write code to temp file.')

            env = env or dict()
            envs = os.environ.copy()
            envs.update(env)
            try:
                # exec code in a new process
                process = subprocess.Popen([sys.executable, filename] +
                                           op.command_args,
                                           env=envs)
                process.wait()
                if process.returncode != 0:
                    raise RuntimeError('Run script failed')

            finally:
                os.remove(filename)

        @classmethod
        def _execute_with_exec(cls, op, local=None):
            local = local or dict()

            try:
                exec(op.code, local)
            finally:
                sys.stdout.flush()

        @classmethod
        def _set_envs(cls, ctx, op):
            scheduler_address = ctx._scheduler_address
            session_id = ctx._session_id

            # set mars envs
            env = os.environ
            env['MARS_SCHEDULER_ADDRESS'] = str(scheduler_address)
            env['MARS_SESSION_ID'] = str(session_id)
            env['RANK'] = str(op.rank)

        @classmethod
        def _build_locals(cls, ctx, op):
            logger.debug('Start to create mars session.')
            sess = ctx.get_current_session().as_default()

            return dict(session=sess)

        @classmethod
        def execute(cls, ctx, op):
            if op.merge:
                return super().execute(ctx, op)

            old_env = os.environ.copy()
            cls._set_envs(ctx, op)

            try:
                if op.mode == 'spawn':
                    cls._execute_with_subprocess(op)
                elif op.mode == 'exec':
                    cls._execute_with_exec(op,
                                           local=cls._build_locals(ctx, op))
                else:
                    raise TypeError('Unsupported mode {}'.format(op.mode))

                if op.rank == 0:
                    ctx[op.outputs[0].key] = {'status': 'ok'}
                else:
                    ctx[op.outputs[0].key] = {}
            finally:
                os.environ = old_env
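
The spawn path above is a write-temp-file-and-exec pattern. A standalone sketch with a hypothetical helper name, doing the environment merge with copy-then-update so the child process actually sees the extra variables:

import os
import subprocess
import sys
import tempfile

def run_code_in_subprocess(code_bytes, args=(), extra_env=None):
    # Dump the serialized source to a temporary .py file, run it with the
    # current interpreter, and always clean the file up afterwards.
    fd, filename = tempfile.mkstemp('.py')
    with os.fdopen(fd, 'wb') as f:
        f.write(code_bytes)
    envs = os.environ.copy()
    envs.update(extra_env or {})
    try:
        process = subprocess.Popen([sys.executable, filename] + list(args),
                                   env=envs)
        process.wait()
        if process.returncode != 0:
            raise RuntimeError('Run script failed')
    finally:
        os.remove(filename)

run_code_in_subprocess(b"print('hello from the spawned script')",
                       extra_env={'RANK': '0'})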
Example #6
File: where.py Project: yyaaa1/mars
class DataFrameWhere(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = opcodes.WHERE

    _input = AnyField('input')
    _cond = AnyField('cond')
    _other = AnyField('other')
    _axis = Int32Field('axis')
    _level = AnyField('level')
    _errors = StringField('errors')
    _try_cast = BoolField('try_cast')
    _replace_true = BoolField('replace_true')

    def __init__(self, input=None, cond=None, other=None,  # pylint: disable=redefined-builtin
                 axis=None, level=None, errors=None, try_cast=None, replace_true=None, **kw):
        super().__init__(_input=input, _cond=cond, _other=other, _axis=axis, _level=level,
                         _errors=errors, _try_cast=try_cast, _replace_true=replace_true, **kw)

    @property
    def input(self):
        return self._input

    @property
    def cond(self):
        return self._cond

    @property
    def other(self):
        return self._other

    @property
    def axis(self):
        return self._axis

    @property
    def level(self):
        return self._level

    @property
    def errors(self):
        return self._errors

    @property
    def try_cast(self):
        return self._try_cast

    @property
    def replace_true(self):
        return self._replace_true

    def __call__(self, df_or_series):
        def _check_input_index(obj, axis=None):
            axis = axis if axis is not None else self.axis
            if isinstance(obj, DATAFRAME_TYPE) \
                    and (
                        df_or_series.columns_value.key != obj.columns_value.key
                        or df_or_series.index_value.key != obj.index_value.key
                    ):
                raise NotImplementedError('Aligning different indices not supported')
            elif isinstance(obj, SERIES_TYPE) \
                    and df_or_series.axes[axis].index_value.key != obj.index_value.key:
                raise NotImplementedError('Aligning different indices not supported')

        _check_input_index(self.cond, axis=0)
        _check_input_index(self.other)

        if isinstance(df_or_series, DATAFRAME_TYPE):
            mock_obj = build_df(df_or_series)
        else:
            mock_obj = build_series(df_or_series)

        if isinstance(self.other, (pd.DataFrame, DATAFRAME_TYPE)):
            mock_other = build_df(self.other)
        elif isinstance(self.other, (pd.Series, SERIES_TYPE)):
            mock_other = build_series(self.other)
        else:
            mock_other = self.other

        result_df = mock_obj.where(np.zeros(mock_obj.shape).astype(bool), other=mock_other,
                                   axis=self.axis, level=self.level, errors=self.errors,
                                   try_cast=self.try_cast)

        inputs = filter_inputs([df_or_series, self.cond, self.other])
        if isinstance(df_or_series, DATAFRAME_TYPE):
            return self.new_dataframe(inputs, shape=df_or_series.shape,
                                      dtypes=result_df.dtypes, index_value=df_or_series.index_value,
                                      columns_value=df_or_series.columns_value)
        else:
            return self.new_series(inputs, shape=df_or_series.shape, name=df_or_series.name,
                                   dtype=result_df.dtype, index_value=df_or_series.index_value)

    def _set_inputs(self, inputs):
        super()._set_inputs(inputs)
        inputs_iter = iter(self._inputs)
        self._input = next(inputs_iter)
        if isinstance(self._cond, (Base, Entity)):
            self._cond = next(inputs_iter)
        if isinstance(self._other, (Base, Entity)):
            self._other = next(inputs_iter)

    @classmethod
    def tile(cls, op: "DataFrameWhere"):
        def rechunk_input(inp, axis=None):
            axis = axis if axis is not None else op.axis
            if isinstance(inp, DATAFRAME_TYPE):
                inp = inp.rechunk(op.input.nsplits)._inplace_tile()
            elif isinstance(inp, SERIES_TYPE):
                inp = inp.rechunk({0: op.input.nsplits[axis]})._inplace_tile()
            return inp

        def get_tiled_chunk(obj, index, axis=None):
            if isinstance(obj, DATAFRAME_TYPE):
                return obj.cix[index[0], index[1]]
            elif isinstance(obj, SERIES_TYPE):
                axis = axis if axis is not None else op.axis
                return obj.cix[index[axis], ]
            else:
                return obj

        # TODO support axis alignment for three objects
        cond = rechunk_input(op.cond, axis=0)
        other = rechunk_input(op.other)

        chunks = []
        for c in op.input.chunks:
            cond_chunk = get_tiled_chunk(cond, c.index, axis=0)
            other_chunk = get_tiled_chunk(other, c.index)

            new_op = op.copy().reset_key()
            new_op._cond = cond_chunk
            new_op._other = other_chunk

            inputs = filter_inputs([c, cond_chunk, other_chunk])
            chunks.append(new_op.new_chunk(inputs, **c.params))

        new_op = op.copy().reset_key()
        return new_op.new_tileables(op.inputs, chunks=chunks, nsplits=op.input.nsplits,
                                    **op.input.params)

    @classmethod
    def execute(cls, ctx, op: "DataFrameWhere"):
        out_obj = op.outputs[0]

        input_data = ctx[op.input.key]
        cond = op.cond
        if isinstance(cond, (Base, Entity)):
            cond = ctx[cond.key]

        other = op.other
        if isinstance(other, (Base, Entity)):
            other = ctx[other.key]

        if op.replace_true:
            ctx[out_obj.key] = input_data.mask(cond, other, axis=op.axis, level=op.level,
                                               errors=op.errors, try_cast=op.try_cast)
        else:
            ctx[out_obj.key] = input_data.where(cond, other, axis=op.axis, level=op.level,
                                                errors=op.errors, try_cast=op.try_cast)
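
DataFrameWhere.execute maps replace_true straight onto pandas: DataFrame.mask when it is True, DataFrame.where otherwise. A pandas-only sketch of that equivalence (axis, level, errors and try_cast omitted):

import pandas as pd

df = pd.DataFrame({'a': [1, -2, 3], 'b': [-4, 5, -6]})
cond = df > 0

kept_where_true = df.where(cond, other=0)      # replace_true=False branch
replaced_where_true = df.mask(cond, other=0)   # replace_true=True branch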
Example #7
class DataFrameReadTable(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123450

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _dtypes = SeriesField('dtypes')
    _add_offset = BoolField('add_offset')
    _columns = ListField('columns')

    def __init__(self,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 columns=None,
                 dtypes=None,
                 sparse=None,
                 add_offset=True,
                 **kw):
        super(DataFrameReadTable,
              self).__init__(_odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _columns=columns,
                             _dtypes=dtypes,
                             _sparse=sparse,
                             _add_offset=add_offset,
                             _object_type=ObjectType.dataframe,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition(self):
        return getattr(self, '_partition_spec', None)

    @property
    def columns(self):
        return self._columns

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def add_offset(self):
        return self._add_offset

    def __call__(self, shape, chunk_bytes=None):
        import numpy as np
        import pandas as pd

        if np.isnan(shape[0]):
            index_value = parse_index(pd.RangeIndex(0))
        else:
            index_value = parse_index(pd.RangeIndex(shape[0]))
        columns_value = parse_index(self.dtypes.index, store_data=True)
        return self.new_dataframe(None,
                                  shape,
                                  dtypes=self.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  chunk_bytes=chunk_bytes)

    @classmethod
    def tile(cls, op):
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.errors import CupidError

        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        o = ODPS(None, None, account=account, **odps_params)
        cupid_session = CupidSession(o)

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or CHUNK_LIMIT

        data_src = o.get_table(op.table_name)
        if op.partition is not None:
            data_src = data_src.get_partition(op.partition)

        logger.debug('Start creating download session from cupid.')
        while True:
            try:
                download_session = cupid_session.create_download_session(
                    data_src, split_size=split_size, columns=op.columns)
                break
            except CupidError:
                logger.debug(
                    'The number of splits exceeds 100000, split_size is {}'.
                    format(split_size))
                if split_size >= MAX_CHUNK_SIZE:
                    raise
                else:
                    split_size *= 2

        logger.debug('%s table splits have been created.',
                     str(len(download_session.splits)))

        out_chunks = []
        # Ignore add_offset at this time.
        op._add_offset = False

        for idx, split in enumerate(download_session.splits):
            chunk_op = DataFrameReadTableSplit(
                cupid_handle=to_str(split.handle),
                split_index=split.split_index,
                split_file_start=split.split_file_start,
                split_file_end=split.split_file_end,
                schema_file_start=split.schema_file_start,
                schema_file_end=split.schema_file_end,
                add_offset=op.add_offset,
                dtypes=op.dtypes,
                sparse=op.sparse)
            # the chunk shape is unknown
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(df.dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None,
                                           shape=(np.nan, df.shape[1]),
                                           dtypes=op.dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(idx, 0))
            out_chunks.append(out_chunk)

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (df.shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=df.shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=df.columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)
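A hedged sketch of driving the DataFrameReadTable variant defined directly above (a second, extended variant follows below). The entry point pyodps actually exposes may differ, the odps_params values are placeholders, and the snippet assumes the surrounding Mars/pyodps module is importable.

import numpy as np
import pandas as pd

dtypes = pd.Series([np.dtype('int64'), np.dtype('object')], index=['id', 'name'])
op = DataFrameReadTable(odps_params={'project': 'my_project',
                                     'endpoint': '<endpoint>'},
                        table_name='my_table',
                        dtypes=dtypes,
                        add_offset=False)
# The row count is unknown before tiling, hence the NaN in shape; tile()
# later replaces it with one chunk per download split.
df = op(shape=(np.nan, len(dtypes)))
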
class DataFrameReadTable(_Base):
    _op_type_ = 123450

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _dtypes = SeriesField('dtypes')
    _add_offset = BoolField('add_offset')
    _columns = ListField('columns')
    _nrows = Int64Field('nrows')
    _use_arrow_dtype = BoolField('use_arrow_dtype')
    _string_as_binary = BoolField('string_as_binary')
    _append_partitions = BoolField('append_partitions')
    _last_modified_time = Int64Field('last_modified_time')
    _with_split_meta_on_tile = BoolField('with_split_meta_on_tile')

    def __init__(self,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 columns=None,
                 dtypes=None,
                 nrows=None,
                 sparse=None,
                 add_offset=True,
                 use_arrow_dtype=None,
                 string_as_binary=None,
                 memory_scale=None,
                 append_partitions=None,
                 last_modified_time=None,
                 with_split_meta_on_tile=False,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameReadTable,
              self).__init__(_odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _columns=columns,
                             _dtypes=dtypes,
                             _nrows=nrows,
                             _sparse=sparse,
                             _use_arrow_dtype=use_arrow_dtype,
                             _string_as_binary=string_as_binary,
                             _add_offset=add_offset,
                             _append_partitions=append_partitions,
                             _last_modified_time=last_modified_time,
                             _memory_scale=memory_scale,
                             _with_split_meta_on_tile=with_split_meta_on_tile,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition(self):
        return getattr(self, '_partition_spec', None)

    @property
    def columns(self):
        return self._columns

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def nrows(self):
        return self._nrows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @property
    def string_as_binary(self):
        return self._string_as_binary

    @property
    def add_offset(self):
        return self._add_offset

    @property
    def append_partitions(self):
        return self._append_partitions

    @property
    def with_split_meta_on_tile(self):
        return self._with_split_meta_on_tile

    def get_columns(self):
        return self._columns

    def set_pruned_columns(self, columns):
        self._columns = columns

    def __call__(self, shape, chunk_bytes=None, chunk_size=None):
        import numpy as np
        import pandas as pd

        if np.isnan(shape[0]):
            index_value = parse_index(pd.RangeIndex(0))
        else:
            index_value = parse_index(pd.RangeIndex(shape[0]))
        columns_value = parse_index(self.dtypes.index, store_data=True)
        return self.new_dataframe(None,
                                  shape,
                                  dtypes=self.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  chunk_bytes=chunk_bytes,
                                  chunk_size=chunk_size)

    @classmethod
    def _tile_cupid(cls, op):
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.errors import CupidError
        from mars.context import get_context

        cupid_ctx = context()

        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=odps_params['project'],
                 endpoint=endpoint)
        cupid_session = CupidSession(o)

        mars_context = get_context()

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index, store_data=True)

        table_obj = o.get_table(op.table_name)
        if not table_obj.schema.partitions:
            data_srcs = [table_obj]
        elif op.partition is not None and check_partition_exist(
                table_obj, op.partition):
            data_srcs = [table_obj.get_partition(op.partition)]
        else:
            data_srcs = list(table_obj.partitions)
            if op.partition is not None:
                data_srcs = filter_partitions(o, data_srcs, op.partition)

        out_chunks = []
        chunk_idx = 0

        for data_src in data_srcs:
            try:
                data_store_size = data_src.size
            except ODPSError:
                # fail to get data size, just ignore
                pass
            else:
                if data_store_size < split_size and mars_context is not None:
                    # get worker counts
                    worker_count = max(
                        len(mars_context.get_worker_addresses()), 1)
                    # data is too small, split as many as number of cores
                    split_size = data_store_size // worker_count
                    # at least 1M
                    split_size = max(split_size, 1 * 1024**2)
                    logger.debug(
                        'Input data size is too small, split_size is %s',
                        split_size)

            logger.debug(
                'Start creating download session of table %s from cupid, '
                'columns: %s', op.table_name, op.columns)
            while True:
                try:
                    download_session = cupid_session.create_download_session(
                        data_src,
                        split_size=split_size,
                        columns=op.columns,
                        with_split_meta=op.with_split_meta_on_tile)
                    break
                except CupidError:
                    logger.debug(
                        'The number of splits exceeds 100000, split_size is %s',
                        split_size)
                    if split_size >= MAX_CHUNK_SIZE:
                        raise
                    else:
                        split_size *= 2

            logger.debug('%s table splits have been created.',
                         str(len(download_session.splits)))

            meta_chunk_rows = [
                split.meta_row_count for split in download_session.splits
            ]
            if np.isnan(out_shape[0]):
                est_chunk_rows = meta_chunk_rows
            else:
                sp_file_sizes = np.array([
                    sp.split_file_end - sp.split_file_start
                    for sp in download_session.splits
                ])
                total_size = sp_file_sizes.sum()
                ratio_chunk_rows = (sp_file_sizes * out_shape[0] //
                                    total_size).tolist()
                est_chunk_rows = [
                    mr if mr is not None else rr
                    for mr, rr in zip(meta_chunk_rows, ratio_chunk_rows)
                ]

            partition_spec = str(data_src.partition_spec) \
                if getattr(data_src, 'partition_spec', None) else None

            logger.warning('Estimated chunk rows: %r', est_chunk_rows)

            if len(download_session.splits) == 0:
                logger.debug('Table %s has no data', op.table_name)
                chunk_op = DataFrameReadTableSplit()
                index_value = parse_index(pd.RangeIndex(0))
                columns_value = parse_index(out_dtypes.index, store_data=True)
                out_chunk = chunk_op.new_chunk(None,
                                               shape=(np.nan, out_shape[1]),
                                               dtypes=op.dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(chunk_idx, 0))
                out_chunks.append(out_chunk)
                chunk_idx += 1
            else:
                for idx, split in enumerate(download_session.splits):
                    chunk_op = DataFrameReadTableSplit(
                        cupid_handle=to_str(split.handle),
                        split_index=split.split_index,
                        split_file_start=split.split_file_start,
                        split_file_end=split.split_file_end,
                        schema_file_start=split.schema_file_start,
                        schema_file_end=split.schema_file_end,
                        add_offset=op.add_offset,
                        dtypes=out_dtypes,
                        sparse=op.sparse,
                        split_size=split_size,
                        string_as_binary=op.string_as_binary,
                        use_arrow_dtype=op.use_arrow_dtype,
                        estimate_rows=est_chunk_rows[idx],
                        partition_spec=partition_spec,
                        append_partitions=op.append_partitions,
                        meta_raw_size=split.meta_raw_size,
                        nrows=meta_chunk_rows[idx] or op.nrows,
                        memory_scale=op.memory_scale)
                    # the chunk shape is unknown
                    index_value = parse_index(pd.RangeIndex(0))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(None,
                                                   shape=(np.nan,
                                                          out_shape[1]),
                                                   dtypes=out_dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(chunk_idx, 0))
                    chunk_idx += 1
                    out_chunks.append(out_chunk)

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)

    @classmethod
    def _tile_tunnel(cls, op):
        from odps import ODPS

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        table_obj = o.get_table(op.table_name)
        if not table_obj.schema.partitions:
            data_srcs = [table_obj]
        elif op.partition is not None and check_partition_exist(
                table_obj, op.partition):
            data_srcs = [table_obj.get_partition(op.partition)]
        else:
            data_srcs = list(table_obj.partitions)
            if op.partition is not None:
                data_srcs = filter_partitions(o, data_srcs, op.partition)

        out_chunks = []
        row_nsplits = []
        index_start = 0
        df = op.outputs[0]

        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index, store_data=True)

        for data_src in data_srcs:
            data_store_size = data_src.size
            shape = out_shape
            chunk_size = df.extra_params.chunk_size

            partition_spec = str(data_src.partition_spec) \
                if getattr(data_src, 'partition_spec', None) else None

            if chunk_size is None:
                chunk_bytes = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT
                chunk_count = data_store_size // chunk_bytes + (
                    data_store_size % chunk_bytes != 0)
                chunk_size = ceildiv(shape[0], chunk_count)
                split_size = chunk_bytes
            else:
                chunk_count = ceildiv(shape[0], chunk_size)
                split_size = data_store_size // chunk_count

            for i in range(chunk_count):
                start_index = chunk_size * i
                end_index = min(chunk_size * (i + 1), shape[0])
                row_size = end_index - start_index
                chunk_op = DataFrameReadTableSplit(
                    table_name=op.table_name,
                    partition_spec=partition_spec,
                    start_index=start_index,
                    end_index=end_index,
                    nrows=op.nrows,
                    odps_params=op.odps_params,
                    columns=op.columns,
                    add_offset=op.add_offset,
                    dtypes=out_dtypes,
                    sparse=op.sparse,
                    split_size=split_size,
                    use_arrow_dtype=op.use_arrow_dtype,
                    estimate_rows=row_size,
                    append_partitions=op.append_partitions,
                    memory_scale=op.memory_scale)
                index_value = parse_index(pd.RangeIndex(
                    start_index, end_index))
                columns_value = parse_index(out_dtypes.index, store_data=True)
                out_chunk = chunk_op.new_chunk(None,
                                               shape=(row_size, out_shape[1]),
                                               dtypes=out_dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(index_start + i, 0))
                row_nsplits.append(row_size)
                out_chunks.append(out_chunk)

            index_start += chunk_count

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = (tuple(row_nsplits), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)

    @classmethod
    def _tile(cls, op):
        from cupid.runtime import RuntimeContext

        if RuntimeContext.is_context_ready():
            return cls._tile_cupid(op)
        else:
            return cls._tile_tunnel(op)

    if not head_can_be_opt:
        tile = _tile
class DataFrameReadTable(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123450

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _dtypes = SeriesField('dtypes')
    _add_offset = BoolField('add_offset')
    _columns = ListField('columns')
    _nrows = Int64Field('nrows')
    _use_arrow_dtype = BoolField('use_arrow_dtype')

    def __init__(self,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 columns=None,
                 dtypes=None,
                 nrows=None,
                 sparse=None,
                 add_offset=True,
                 use_arrow_dtype=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameReadTable,
              self).__init__(_odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _columns=columns,
                             _dtypes=dtypes,
                             _nrows=nrows,
                             _sparse=sparse,
                             _use_arrow_dtype=use_arrow_dtype,
                             _add_offset=add_offset,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition(self):
        return getattr(self, '_partition_spec', None)

    @property
    def columns(self):
        return self._columns

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def nrows(self):
        return self._nrows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @property
    def add_offset(self):
        return self._add_offset

    def __call__(self, shape, chunk_bytes=None):
        import numpy as np
        import pandas as pd

        if np.isnan(shape[0]):
            index_value = parse_index(pd.RangeIndex(0))
        else:
            index_value = parse_index(pd.RangeIndex(shape[0]))
        columns_value = parse_index(self.dtypes.index, store_data=True)
        return self.new_dataframe(None,
                                  shape,
                                  dtypes=self.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  chunk_bytes=chunk_bytes)

    @classmethod
    def tile(cls, op):
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from mars.context import get_context

        cupid_ctx = context()
        if cupid_ctx is None:
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )

        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        o = ODPS(None, None, account=account, **odps_params)
        cupid_session = CupidSession(o)

        mars_context = get_context()

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

        data_src = o.get_table(op.table_name)
        if op.partition is not None:
            data_src = data_src.get_partition(op.partition)

        try:
            data_store_size = data_src.size
        except ODPSError:
            # fail to get data size, just ignore
            pass
        else:
            if data_store_size < split_size and mars_context is not None:
                # get worker counts
                worker_count = max(len(mars_context.get_worker_addresses()), 1)
                # data is too small, split as many as number of cores
                split_size = data_store_size // worker_count
                # at least 1M
                split_size = max(split_size, 1 * 1024**2)
                logger.debug(
                    'Input data size is too small, split_size is {}'.format(
                        split_size))

        logger.debug(
            'Start creating download session of table {} from cupid.'.format(
                op.table_name))
        while True:
            try:
                download_session = cupid_session.create_download_session(
                    data_src, split_size=split_size, columns=op.columns)
                break
            except CupidError:
                logger.debug(
                    'The number of splits exceeds 100000, split_size is {}'.
                    format(split_size))
                if split_size >= MAX_CHUNK_SIZE:
                    raise
                else:
                    split_size *= 2

        logger.debug('%s table splits have been created.',
                     str(len(download_session.splits)))

        if np.isnan(df.shape[0]):
            est_chunk_rows = [None] * len(download_session.splits)
        else:
            sp_file_sizes = np.array([
                sp.split_file_end - sp.split_file_start
                for sp in download_session.splits
            ])
            total_size = sp_file_sizes.sum()
            est_chunk_rows = sp_file_sizes * df.shape[0] // total_size

        logger.warning('Estimated chunk rows: %r', est_chunk_rows)

        out_chunks = []
        # Ignore add_offset at this time.
        op._add_offset = False

        if len(download_session.splits) == 0:
            logger.debug('Table {} has no data'.format(op.table_name))
            chunk_op = DataFrameReadTableSplit()
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(df.dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None,
                                           shape=(np.nan, df.shape[1]),
                                           dtypes=op.dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(0, 0))
            out_chunks = [out_chunk]
        else:
            for idx, split in enumerate(download_session.splits):
                chunk_op = DataFrameReadTableSplit(
                    cupid_handle=to_str(split.handle),
                    split_index=split.split_index,
                    split_file_start=split.split_file_start,
                    split_file_end=split.split_file_end,
                    schema_file_start=split.schema_file_start,
                    schema_file_end=split.schema_file_end,
                    add_offset=op.add_offset,
                    dtypes=op.dtypes,
                    sparse=op.sparse,
                    split_size=split_size,
                    use_arrow_dtype=op.use_arrow_dtype,
                    estimate_rows=est_chunk_rows[idx])
                # the chunk shape is unknown
                index_value = parse_index(pd.RangeIndex(0))
                columns_value = parse_index(df.dtypes.index, store_data=True)
                out_chunk = chunk_op.new_chunk(None,
                                               shape=(np.nan, df.shape[1]),
                                               dtypes=op.dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(idx, 0))
                out_chunks.append(out_chunk)

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (df.shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=df.shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=df.columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)
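
A standalone sketch of two ideas used in the tile method above: growing the split size until the download session can be created, and estimating per-chunk rows in proportion to split file sizes. The helper names and the try_create callable are illustrative, not part of the class.

import numpy as np

def create_with_growing_split_size(try_create, split_size, max_split_size):
    # Keep retrying with a doubled split_size until creation succeeds or the
    # configured maximum is reached, mirroring the while-loop above.
    while True:
        try:
            return try_create(split_size), split_size
        except Exception:
            if split_size >= max_split_size:
                raise
            split_size *= 2

def estimate_rows_per_split(split_file_sizes, total_rows):
    # Distribute a known total row count across splits proportionally to
    # their file sizes, as est_chunk_rows is computed above.
    sizes = np.asarray(split_file_sizes)
    return sizes * total_rows // sizes.sum()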
Example #10
0
class DataFrameReadTableSplit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123451

    _cupid_handle = StringField('cupid_handle')
    _split_index = Int64Field('split_index')
    _split_file_start = Int64Field('split_file_start')
    _split_file_end = Int64Field('split_file_end')
    _schema_file_start = Int64Field('schema_file_start')
    _schema_file_end = Int64Field('schema_file_end')
    _dtypes = SeriesField('dtypes')
    _nrows = Int64Field('nrows')

    def __init__(self,
                 cupid_handle=None,
                 split_index=None,
                 split_file_start=None,
                 split_file_end=None,
                 schema_file_start=None,
                 schema_file_end=None,
                 nrows=None,
                 dtypes=None,
                 sparse=None,
                 **kw):
        super(DataFrameReadTableSplit,
              self).__init__(_cupid_handle=cupid_handle,
                             _split_index=split_index,
                             _split_file_start=split_file_start,
                             _split_file_end=split_file_end,
                             _schema_file_start=schema_file_start,
                             _schema_file_end=schema_file_end,
                             _nrows=nrows,
                             _dtypes=dtypes,
                             _sparse=sparse,
                             _object_type=ObjectType.dataframe,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def output_limit(self):
        return 1

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def split_index(self):
        return self._split_index

    @property
    def split_file_start(self):
        return self._split_file_start

    @property
    def split_file_end(self):
        return self._split_file_end

    @property
    def schema_file_start(self):
        return self._schema_file_start

    @property
    def schema_file_end(self):
        return self._schema_file_end

    @property
    def nrows(self):
        return self._nrows

    @property
    def dtypes(self):
        return self._dtypes

    @classmethod
    def execute(cls, ctx, op):
        import pyarrow as pa
        from cupid.io.table import TableSplit

        tsp = TableSplit(
            _handle=op.cupid_handle,
            _split_index=op.split_index,
            _split_file_start=op.split_file_start,
            _split_file_end=op.split_file_end,
            _schema_file_start=op.schema_file_start,
            _schema_file_end=op.schema_file_end,
        )
        logger.debug('Read split table, split index: %s', op.split_index)
        reader = tsp.open_arrow_reader()
        if op.nrows is not None:
            nrows = 0
            batches = []
            while nrows < op.nrows:
                try:
                    batch = reader.read_next_batch()
                    nrows += batch.num_rows
                    batches.append(batch)
                except StopIteration:
                    break
            logger.debug('Read %s rows of this split.', op.nrows)
            data = pa.Table.from_batches(batches).to_pandas()[:op.nrows]
        else:
            arrow_table = reader.read_all()
            data = arrow_table.to_pandas()
        logger.debug("Read data size is %s",
                     data.memory_usage(deep=True).sum())
        logger.debug('Read split table finished, split index: %s',
                     op.split_index)
        ctx[op.outputs[0].key] = data
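
A generic sketch of the accumulation loop in execute above: keep pulling record batches until at least nrows rows are gathered, then trim. It only assumes an iterable of pyarrow RecordBatches that yields at least one batch.

import pyarrow as pa

def read_at_least(batches_iter, nrows):
    # Collect batches until enough rows are available, then trim the result.
    batches, collected = [], 0
    for batch in batches_iter:
        batches.append(batch)
        collected += batch.num_rows
        if collected >= nrows:
            break
    return pa.Table.from_batches(batches).to_pandas()[:nrows]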
Example #11
0
class DataFrameReadTableSplit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123451

    _cupid_handle = StringField('cupid_handle')
    _split_index = Int64Field('split_index')
    _split_file_start = Int64Field('split_file_start')
    _split_file_end = Int64Field('split_file_end')
    _schema_file_start = Int64Field('schema_file_start')
    _schema_file_end = Int64Field('schema_file_end')
    _use_arrow_dtype = BoolField('use_arrow_dtype')
    _dtypes = SeriesField('dtypes')
    _nrows = Int64Field('nrows')

    _split_size = Int64Field('split_size')
    _estimate_rows = Int64Field('estimate_rows')

    def __init__(self,
                 cupid_handle=None,
                 split_index=None,
                 split_file_start=None,
                 split_file_end=None,
                 schema_file_start=None,
                 schema_file_end=None,
                 nrows=None,
                 dtypes=None,
                 split_size=None,
                 use_arrow_dtype=None,
                 estimate_rows=None,
                 sparse=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameReadTableSplit,
              self).__init__(_cupid_handle=cupid_handle,
                             _split_index=split_index,
                             _split_file_start=split_file_start,
                             _split_file_end=split_file_end,
                             _schema_file_start=schema_file_start,
                             _schema_file_end=schema_file_end,
                             _use_arrow_dtype=use_arrow_dtype,
                             _nrows=nrows,
                             _estimate_rows=estimate_rows,
                             _split_size=split_size,
                             _dtypes=dtypes,
                             _sparse=sparse,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def output_limit(self):
        return 1

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def split_index(self):
        return self._split_index

    @property
    def split_file_start(self):
        return self._split_file_start

    @property
    def split_file_end(self):
        return self._split_file_end

    @property
    def schema_file_start(self):
        return self._schema_file_start

    @property
    def schema_file_end(self):
        return self._schema_file_end

    @property
    def nrows(self):
        return self._nrows

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def split_size(self):
        return self._split_size

    @property
    def estimate_rows(self):
        return self._estimate_rows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @classmethod
    def estimate_size(cls, ctx, op):
        import numpy as np

        def is_object_dtype(dtype):
            try:
                return np.issubdtype(dtype, np.object_) \
                       or np.issubdtype(dtype, np.unicode_) \
                       or np.issubdtype(dtype, np.bytes_)
            except TypeError:  # pragma: no cover
                return False

        if op.split_size is None:
            ctx[op.outputs[0].key] = (0, 0)
            return

        arrow_size = ORC_COMPRESSION_RATIO * op.split_size
        n_strings = len([dt for dt in op.dtypes if is_object_dtype(dt)])
        if op.estimate_rows or op.nrows:
            rows = op.nrows if op.nrows is not None else op.estimate_rows
            pd_size = arrow_size + n_strings * rows * STRING_FIELD_OVERHEAD
            logger.debug('Estimate pandas memory cost: %r', pd_size)
        else:
            pd_size = arrow_size * 10 if n_strings else arrow_size

        ctx[op.outputs[0].key] = (pd_size, pd_size + arrow_size)

    @classmethod
    def execute(cls, ctx, op):
        import pyarrow as pa
        from cupid.io.table import TableSplit

        if op.cupid_handle is None:
            empty_df = pd.DataFrame()
            for name, dtype in op.outputs[0].dtypes.items():
                empty_df[name] = pd.Series(dtype=dtype)
            ctx[op.outputs[0].key] = empty_df
            return

        tsp = TableSplit(
            _handle=op.cupid_handle,
            _split_index=op.split_index,
            _split_file_start=op.split_file_start,
            _split_file_end=op.split_file_end,
            _schema_file_start=op.schema_file_start,
            _schema_file_end=op.schema_file_end,
        )
        logger.debug('Read split table, split index: %s', op.split_index)
        reader = tsp.open_arrow_reader()
        if op.nrows is not None:
            nrows = 0
            batches = []
            while nrows < op.nrows:
                try:
                    batch = reader.read_next_batch()
                    nrows += batch.num_rows
                    batches.append(batch)
                except StopIteration:
                    break
            logger.debug('Read %s rows of this split.', op.nrows)
            data = arrow_table_to_pandas_dataframe(
                pa.Table.from_batches(batches),
                use_arrow_dtype=op.use_arrow_dtype)[:op.nrows]
        else:
            arrow_table = reader.read_all()
            data = arrow_table_to_pandas_dataframe(
                arrow_table, use_arrow_dtype=op.use_arrow_dtype)
        data_columns = data.dtypes.index
        expected_columns = op.outputs[0].dtypes.index
        if not data_columns.equals(expected_columns):
            logger.debug("Data columns differs from output columns, "
                         "data columns: {}, output columns: {}".format(
                             data_columns, expected_columns))
            data.columns = expected_columns

        logger.debug('Read split table finished, split index: %s',
                     op.split_index)
        logger.debug('Split data shape is %s, size is %s', data.shape,
                     data.memory_usage(deep=True).sum())
        ctx[op.outputs[0].key] = data
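
The estimate_size method above scales the on-disk split size by an ORC compression ratio and, when a row estimate exists, adds a per-row overhead for every string-typed column. A hedged sketch of that arithmetic; the defaults below are placeholders for the module-level ORC_COMPRESSION_RATIO and STRING_FIELD_OVERHEAD constants.

def estimate_split_memory(split_size, n_string_cols, est_rows,
                          compression_ratio=5, string_overhead=50):
    # Placeholder constants; only the structure mirrors estimate_size above.
    arrow_size = compression_ratio * split_size
    if est_rows:
        pd_size = arrow_size + n_string_cols * est_rows * string_overhead
    else:
        # Without a row estimate, assume string-heavy data expands roughly 10x.
        pd_size = arrow_size * 10 if n_string_cols else arrow_size
    return pd_size, pd_size + arrow_size  # result size, result plus arrow table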
Example #12
0
class DataFrameReadTable(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123450

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _dtypes = SeriesField('dtypes')
    _add_offset = BoolField('add_offset')

    def __init__(self,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 dtypes=None,
                 sparse=None,
                 add_offset=True,
                 **kw):
        super(DataFrameReadTable,
              self).__init__(_odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _dtypes=dtypes,
                             _sparse=sparse,
                             _add_offset=add_offset,
                             _object_type=ObjectType.dataframe,
                             **kw)

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition(self):
        return getattr(self, '_partition_spec', None)

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def add_offset(self):
        return self._add_offset

    def __call__(self, shape, chunk_store_limit=None):
        import numpy as np
        import pandas as pd

        if np.isnan(shape[0]):
            index_value = parse_index(pd.RangeIndex(0))
        else:
            index_value = parse_index(pd.RangeIndex(shape[0]))
        columns_value = parse_index(self.dtypes.index, store_data=True)
        return self.new_dataframe(None,
                                  shape,
                                  dtypes=self.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  chunk_store_limit=chunk_store_limit)

    @classmethod
    def tile(cls, op):
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context

        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        o = ODPS(None, None, account=account, **op.odps_params)
        cupid_session = CupidSession(o)

        df = op.outputs[0]
        split_size = df.extra_params.chunk_store_limit or options.tensor.chunk_store_limit

        data_src = o.get_table(op.table_name)
        if op.partition is not None:
            data_src = data_src.get_partition(op.partition)

        logger.debug('Start creating download session from cupid.')
        download_session = cupid_session.create_download_session(
            data_src, split_size=split_size)
        logger.debug('%s table splits have been created.',
                     str(len(download_session.splits)))

        out_chunks = []
        out_count_chunks = []
        for idx, split in enumerate(download_session.splits):
            chunk_op = DataFrameReadTableSplit(
                cupid_handle=to_str(split.handle),
                split_index=split.split_index,
                split_file_start=split.split_file_start,
                split_file_end=split.split_file_end,
                schema_file_start=split.schema_file_start,
                schema_file_end=split.schema_file_end,
                dtypes=op.dtypes,
                sparse=op.sparse)
            # the chunk shape is unknown
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(df.dtypes.index, store_data=True)
            out_chunk, out_count_chunk = chunk_op.new_chunks(
                None,
                kws=[{
                    'shape': (np.nan, df.shape[1]),
                    'dtypes': op.dtypes,
                    'index_value': index_value,
                    'columns_value': columns_value,
                    'index': (idx, )
                }, {
                    'shape': (1, ),
                    'index': (idx, )
                }])
            out_chunks.append(out_chunk)
            out_count_chunks.append(out_count_chunk)

        if op.add_offset:
            output_chunks = []
            for i, chunk in enumerate(out_chunks):
                if i == 0:
                    output_chunks.append(chunk)
                    continue
                counts = out_count_chunks[:i]
                inputs = [chunk] + counts
                output_chunk = DataFrameReadTableWithOffset(
                    dtypes=chunk.dtypes).new_chunk(
                        inputs,
                        shape=chunk.shape,
                        index=chunk.index,
                        dtypes=chunk.dtypes,
                        index_value=chunk.index_value,
                        columns_value=chunk.columns_value)
                output_chunks.append(output_chunk)
        else:
            output_chunks = out_chunks

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(output_chunks), (df.shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=df.shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=df.columns_value,
                                     chunks=output_chunks,
                                     nsplits=nsplits)
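
When add_offset is set, every chunk after the first receives all preceding count chunks as inputs, so DataFrameReadTableWithOffset can shift its RangeIndex by the number of rows read before it. A minimal sketch of that offset arithmetic with hypothetical row counts:

def index_offsets(chunk_row_counts):
    # The offset of chunk i is the total number of rows in chunks 0..i-1.
    offsets, total = [], 0
    for count in chunk_row_counts:
        offsets.append(total)
        total += count
    return offsets

# index_offsets([3, 5, 2]) -> [0, 3, 8]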
Example #13
0
class DataFrameReadTableSplit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123451

    _cupid_handle = StringField('cupid_handle')
    _split_index = Int64Field('split_index')
    _split_file_start = Int64Field('split_file_start')
    _split_file_end = Int64Field('split_file_end')
    _schema_file_start = Int64Field('schema_file_start')
    _schema_file_end = Int64Field('schema_file_end')
    _dtypes = SeriesField('dtypes')

    def __init__(self,
                 cupid_handle=None,
                 split_index=None,
                 split_file_start=None,
                 split_file_end=None,
                 schema_file_start=None,
                 schema_file_end=None,
                 dtypes=None,
                 sparse=None,
                 **kw):
        super(DataFrameReadTableSplit,
              self).__init__(_cupid_handle=cupid_handle,
                             _split_index=split_index,
                             _split_file_start=split_file_start,
                             _split_file_end=split_file_end,
                             _schema_file_start=schema_file_start,
                             _schema_file_end=schema_file_end,
                             _dtypes=dtypes,
                             _sparse=sparse,
                             _object_type=ObjectType.dataframe,
                             **kw)

    @property
    def output_limit(self):
        return 2

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def split_index(self):
        return self._split_index

    @property
    def split_file_start(self):
        return self._split_file_start

    @property
    def split_file_end(self):
        return self._split_file_end

    @property
    def schema_file_start(self):
        return self._schema_file_start

    @property
    def schema_file_end(self):
        return self._schema_file_end

    @property
    def dtypes(self):
        return self._dtypes

    @classmethod
    def execute(cls, ctx, op):
        import numpy as np
        from cupid.io.table import TableSplit

        tsp = TableSplit(
            _handle=op.cupid_handle,
            _split_index=op.split_index,
            _split_file_start=op.split_file_start,
            _split_file_end=op.split_file_end,
            _schema_file_start=op.schema_file_start,
            _schema_file_end=op.schema_file_end,
        )
        logger.debug('Read split table, split index: %s', op.split_index)
        reader = tsp.open_arrow_reader()
        data = reader.read_all().to_pandas()
        logger.debug('Read split table finished, split index: %s',
                     op.split_index)
        count = np.array([data.shape[0]])
        data_chunk, count_chunk = op.outputs
        ctx[data_chunk.key] = data
        ctx[count_chunk.key] = count
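
execute above emits two results per split: the pandas data itself and a one-element count array that later feeds the offset computation. A toy illustration of the pair produced for one split:

import numpy as np
import pandas as pd

def split_results(data):
    # Return the data plus its row count, matching the two outputs above.
    return data, np.array([data.shape[0]])

frame, count = split_results(pd.DataFrame({'a': [1, 2, 3]}))
# count -> array([3])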
Example #14
0
class DataFrameWriteTableCommit(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123462

    _dtypes = SeriesField('dtypes')

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _overwrite = BoolField('overwrite')
    _blocks = DictField('blocks')
    _cupid_handle = StringField('cupid_handle')
    _is_terminal = BoolField('is_terminal')

    def __init__(self,
                 dtypes=None,
                 odps_params=None,
                 table_name=None,
                 blocks=None,
                 cupid_handle=None,
                 overwrite=False,
                 is_terminal=None,
                 **kw):
        super(DataFrameWriteTableCommit,
              self).__init__(_dtypes=dtypes,
                             _odps_params=odps_params,
                             _table_name=table_name,
                             _blocks=blocks,
                             _overwrite=overwrite,
                             _cupid_handle=cupid_handle,
                             _is_terminal=is_terminal,
                             _object_type=ObjectType.dataframe,
                             **kw)

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def table_name(self):
        return self._table_name

    @property
    def blocks(self):
        return self._blocks

    @property
    def overwrite(self):
        return self._overwrite

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def is_terminal(self):
        return self._is_terminal

    @classmethod
    def execute(cls, ctx, op):
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.io.table import CupidTableUploadSession

        if op.is_terminal:
            bearer_token = context().get_bearer_token()
            account = BearerTokenAccount(bearer_token)
            o = ODPS(None, None, account=account, **op.odps_params)
            cupid_session = CupidSession(o)

            project_name, table_name = op.table_name.split('.')
            upload_session = CupidTableUploadSession(session=cupid_session,
                                                     table_name=table_name,
                                                     project_name=project_name,
                                                     handle=op.cupid_handle,
                                                     blocks=op.blocks)
            upload_session.commit(overwrite=op.overwrite)

        ctx[op.outputs[0].key] = pd.DataFrame()
Example #15
0
class TensorStoreCOO(TensorDataStore):
    _op_type_ = OperandDef.STORE_COO

    _input = KeyField('input')
    _path = StringField('path')
    _dim_cols = ListField('dim_cols', ValueType.string)
    _value_col = StringField('value_col')
    _storage_options = StringField('storage_options')
    _global_index = BoolField('global_index', default=False)
    _axis_offsets = TupleField('axis_offsets')

    def __init__(self, dtype=None, path=None, dim_cols=None, value_col=None,
                 storage_options=None, sparse=True, global_index=False, **kw):
        super(TensorStoreCOO, self).__init__(_path=path, _dim_cols=dim_cols, _value_col=value_col,
                                             _dtype=dtype, _storage_options=storage_options,
                                             _global_index=global_index, _sparse=sparse, **kw)

    @property
    def input(self):
        return self._input

    @property
    def path(self):
        return self._path

    @property
    def dim_cols(self):
        return self._dim_cols

    @property
    def value_col(self):
        return self._value_col

    @property
    def storage_options(self):
        return self._storage_options

    @property
    def global_index(self):
        return self._global_index

    @property
    def axis_offsets(self):
        return self._axis_offsets

    def _set_inputs(self, inputs):
        super(TensorStoreCOO, self)._set_inputs(inputs)
        self._input = self._inputs[0]

    def calc_shape(self, *inputs_shape):
        return (0,) * len(inputs_shape[0])

    @classmethod
    def tile(cls, op):
        in_tensor = op.input

        out_chunks = []
        out_chunk_shape = (0,) * in_tensor.ndim
        axis_offsets = [[0] + np.cumsum(ns)[:-1].tolist() for ns in in_tensor.nsplits]
        for chunk in in_tensor.chunks:
            chunk_op = op.copy().reset_key()
            chunk_path = '%s/%s.parquet' % (
                chunk_op.path, ','.join(str(j) for j in chunk.index))
            chunk_op._path = chunk_path
            chunk_op._axis_offsets = \
                tuple(axis_offsets[axis][idx] for axis, idx in enumerate(chunk.index))
            out_chunk = chunk_op.new_chunk([chunk], shape=out_chunk_shape, index=chunk.index)
            out_chunks.append(out_chunk)

        new_op = op.copy()
        return new_op.new_tensors(op.inputs, op.outputs[0].shape,
                                  chunks=out_chunks,
                                  nsplits=((0,) * len(ns) for ns in in_tensor.nsplits))

    @classmethod
    def execute(cls, ctx, op):
        import numpy as np
        import pandas as pd
        import pyarrow as pa
        import pyarrow.parquet as pq
        from ..io import open as fs_open

        to_store_data = ctx[op.inputs[0].key]
        storage_opts = json.loads(op.storage_options)
        axis_offsets = op.axis_offsets
        store_global_index = op.global_index
        dim_cols = op.dim_cols
        col_to_array = {}

        if isinstance(to_store_data, SparseNDArray):
            # sparse, convert to coo matrix
            matrix = to_store_data.raw.tocoo(copy=False)
            ndim = matrix.ndim

            if len(dim_cols) > 1:
                col_to_array[dim_cols[0]] = matrix.row
                if store_global_index:
                    # global index
                    col_to_array['global_' + dim_cols[0]] = matrix.row + axis_offsets[0]
                col_to_array[dim_cols[1]] = matrix.col
                if store_global_index:
                    col_to_array['global_' + dim_cols[1]] = matrix.col + axis_offsets[1]
            else:
                col_to_array[dim_cols[0]] = matrix.col
                if store_global_index:
                    col_to_array['global_' + dim_cols[0]] = matrix.col + axis_offsets[0]

            col_to_array[op.value_col] = matrix.data
        else:
            # dense, convert to numpy array
            arr = as_np_array(to_store_data)
            ndim = arr.ndim

            index = np.array(np.meshgrid(*[np.arange(s) for s in arr.shape])).T.reshape(-1, arr.ndim).T
            for j, col, ind in zip(range(len(dim_cols)), dim_cols, index):
                col_to_array[col] = ind
                if store_global_index:
                    col_to_array['global_' + col] = ind + axis_offsets[j]
            col_to_array[op.value_col] = arr.ravel()

        df = pd.DataFrame(col_to_array)
        if len(op.dim_cols) > ndim:
            for col in op.dim_cols[ndim:]:
                df[col] = None
        table = pa.Table.from_pandas(df)
        bio = BytesIO()
        pq.write_table(table, bio)
        bio.seek(0)

        # write oss
        with fs_open(op.path, 'wb', **storage_opts) as out_file:
            out_file.write(bio.read())

        ctx[op.outputs[0].key] = np.empty((0,) * to_store_data.ndim)
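
The dense branch above derives one coordinate column per dimension via meshgrid before flattening the values. A standalone sketch of that index construction (the function name is illustrative):

import numpy as np

def dense_coordinates(shape):
    # Coordinate rows for every element of a dense array, using the same
    # meshgrid/transpose/reshape formulation as the dense branch above.
    ndim = len(shape)
    index = np.array(np.meshgrid(*[np.arange(s) for s in shape]))
    return index.T.reshape(-1, ndim).T  # shape: (ndim, number of elements)

# dense_coordinates((2, 3)) gives the row and column index of all 6 elements.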
Example #16
0
class DataFrameWriteTable(DataFrameOperand, DataFrameOperandMixin):
    _op_type_ = 123460

    _dtypes = SeriesField('dtypes')

    _odps_params = DictField('odps_params')
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _overwrite = BoolField('overwrite')
    _write_batch_size = Int64Field('write_batch_size')

    def __init__(self,
                 dtypes=None,
                 odps_params=None,
                 table_name=None,
                 partition_spec=None,
                 over_write=None,
                 write_batch_size=None,
                 **kw):
        super(DataFrameWriteTable,
              self).__init__(_dtypes=dtypes,
                             _odps_params=odps_params,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _overwrite=over_write,
                             _write_batch_size=write_batch_size,
                             _object_type=ObjectType.dataframe,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def overwrite(self):
        return self._overwrite

    @property
    def write_batch_size(self):
        return self._write_batch_size

    def __call__(self, x):
        shape = (0, ) * len(x.shape)
        return self.new_dataframe([x], shape=shape)

    @classmethod
    def tile(cls, op):
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from mars.dataframe.utils import build_concatenated_rows_frame

        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        o = ODPS(None, None, account=account, **odps_params)
        cupid_session = CupidSession(o)

        data_src = o.get_table(op.table_name)

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        input_df = build_concatenated_rows_frame(op.inputs[0])

        out_chunks = []
        out_chunk_shape = (0, ) * len(input_df.shape)
        blocks = {}
        for chunk in input_df.chunks:
            block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace(
                '-', '')
            chunk_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes,
                table_name=op.table_name,
                partition_spec=op.partition_spec,
                cupid_handle=to_str(upload_session.handle),
                block_id=block_id,
                write_batch_size=op.write_batch_size)
            out_chunk = chunk_op.new_chunk([chunk],
                                           shape=out_chunk_shape,
                                           index=chunk.index,
                                           dtypes=chunk.dtypes)
            out_chunks.append(out_chunk)
            blocks[block_id] = op.partition_spec

        # build commit tree
        combine_size = 8
        chunks = out_chunks
        while len(chunks) > combine_size:
            new_chunks = []
            for i in range(0, len(chunks), combine_size):
                chks = chunks[i:i + combine_size]
                if len(chks) == 1:
                    chk = chks[0]
                else:
                    chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                       is_terminal=False)
                    chk = chk_op.new_chunk(chks,
                                           shape=out_chunk_shape,
                                           dtypes=op.dtypes)
                new_chunks.append(chk)
            chunks = new_chunks

        # after the loop there can be at most `combine_size` chunks left
        assert len(chunks) <= combine_size

        commit_table_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                    table_name=op.table_name,
                                                    blocks=blocks,
                                                    cupid_handle=to_str(
                                                        upload_session.handle),
                                                    overwrite=op.overwrite,
                                                    odps_params=op.odps_params,
                                                    is_terminal=True)
        commit_table_chunk = commit_table_op.new_chunk(chunks,
                                                       shape=out_chunk_shape,
                                                       dtypes=op.dtypes)

        out_df = op.outputs[0]
        new_op = op.copy()
        return new_op.new_dataframes(op.inputs,
                                     shape=out_df.shape,
                                     dtypes=out_df.dtypes,
                                     chunks=[commit_table_chunk],
                                     nsplits=((0, ), ) * len(out_chunk_shape))
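
The commit tree above merges write chunks in groups of combine_size into intermediate DataFrameWriteTableCommit nodes until few enough remain for a single terminal commit. A sketch of the grouping loop with a stand-in merge function:

def reduce_in_groups(chunks, combine_size=8, merge=tuple):
    # Repeatedly merge groups of up to combine_size items; `merge` stands in
    # for creating an intermediate commit chunk.
    while len(chunks) > combine_size:
        merged = []
        for i in range(0, len(chunks), combine_size):
            group = chunks[i:i + combine_size]
            merged.append(group[0] if len(group) == 1 else merge(group))
        chunks = merged
    return chunks

# reduce_in_groups(list(range(20)), combine_size=4) ends with two merged nodes.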
Example #17
0
class DataFrameReadTableSplit(_Base):
    _op_type_ = 123451

    # for cupid
    _cupid_handle = StringField('cupid_handle')
    _split_index = Int64Field('split_index')
    _split_file_start = Int64Field('split_file_start')
    _split_file_end = Int64Field('split_file_end')
    _schema_file_start = Int64Field('schema_file_start')
    _schema_file_end = Int64Field('schema_file_end')
    _use_arrow_dtype = BoolField('use_arrow_dtype')
    _string_as_binary = BoolField('string_as_binary')
    _dtypes = SeriesField('dtypes')
    _nrows = Int64Field('nrows')

    # for tunnel
    _table_name = StringField('table_name')
    _partition_spec = StringField('partition_spec')
    _start_index = Int64Field('start_index')
    _end_index = Int64Field('end_index')
    _odps_params = DictField('odps_params')
    _columns = ListField('columns')

    _split_size = Int64Field('split_size')
    _append_partitions = BoolField('append_partitions')
    _estimate_rows = Int64Field('estimate_rows')
    _meta_raw_size = Int64Field('meta_raw_size')

    def __init__(self,
                 cupid_handle=None,
                 split_index=None,
                 split_file_start=None,
                 split_file_end=None,
                 schema_file_start=None,
                 schema_file_end=None,
                 table_name=None,
                 partition_spec=None,
                 start_index=None,
                 end_index=None,
                 odps_params=None,
                 columns=None,
                 nrows=None,
                 dtypes=None,
                 string_as_binary=None,
                 split_size=None,
                 use_arrow_dtype=None,
                 memory_scale=None,
                 estimate_rows=None,
                 meta_raw_size=None,
                 append_partitions=None,
                 sparse=None,
                 **kw):
        kw.update(_output_type_kw)
        super(DataFrameReadTableSplit,
              self).__init__(_cupid_handle=cupid_handle,
                             _split_index=split_index,
                             _split_file_start=split_file_start,
                             _split_file_end=split_file_end,
                             _schema_file_start=schema_file_start,
                             _schema_file_end=schema_file_end,
                             _table_name=table_name,
                             _partition_spec=partition_spec,
                             _columns=columns,
                             _start_index=start_index,
                             _end_index=end_index,
                             _odps_params=odps_params,
                             _use_arrow_dtype=use_arrow_dtype,
                             _string_as_binary=string_as_binary,
                             _nrows=nrows,
                             _estimate_rows=estimate_rows,
                             _split_size=split_size,
                             _dtypes=dtypes,
                             _append_partitions=append_partitions,
                             _sparse=sparse,
                             _meta_raw_size=meta_raw_size,
                             _memory_scale=memory_scale,
                             **kw)

    @property
    def retryable(self):
        return False

    @property
    def output_limit(self):
        return 1

    @property
    def cupid_handle(self):
        return self._cupid_handle

    @property
    def split_index(self):
        return self._split_index

    @property
    def split_file_start(self):
        return self._split_file_start

    @property
    def split_file_end(self):
        return self._split_file_end

    @property
    def schema_file_start(self):
        return self._schema_file_start

    @property
    def schema_file_end(self):
        return self._schema_file_end

    @property
    def table_name(self):
        return self._table_name

    @property
    def partition_spec(self):
        return self._partition_spec

    @property
    def start_index(self):
        return self._start_index

    @property
    def end_index(self):
        return self._end_index

    @property
    def odps_params(self):
        return self._odps_params

    @property
    def columns(self):
        return self._columns

    @property
    def nrows(self):
        return self._nrows

    @property
    def dtypes(self):
        return self._dtypes

    @property
    def split_size(self):
        return self._split_size

    @property
    def estimate_rows(self):
        return self._estimate_rows

    @property
    def use_arrow_dtype(self):
        return self._use_arrow_dtype

    @property
    def string_as_binary(self):
        return self._string_as_binary

    @property
    def append_partitions(self):
        return self._append_partitions

    @property
    def meta_raw_size(self):
        return self._meta_raw_size

    @classmethod
    def estimate_size(cls, ctx, op):
        import numpy as np

        def is_object_dtype(dtype):
            try:
                return np.issubdtype(dtype, np.object_) \
                       or np.issubdtype(dtype, np.unicode_) \
                       or np.issubdtype(dtype, np.bytes_)
            except TypeError:  # pragma: no cover
                return False

        if op.split_size is None:
            ctx[op.outputs[0].key] = (0, 0)
            return

        arrow_size = (op.memory_scale or ORC_COMPRESSION_RATIO) * op.split_size
        if op.meta_raw_size is not None:
            raw_arrow_size = (op.memory_scale or 1) * op.meta_raw_size
            arrow_size = max(arrow_size, raw_arrow_size)

        n_strings = len([dt for dt in op.dtypes if is_object_dtype(dt)])
        if op.estimate_rows or op.nrows:
            rows = op.nrows if op.nrows is not None else op.estimate_rows
            pd_size = arrow_size + n_strings * rows * STRING_FIELD_OVERHEAD
            logger.debug('Estimate pandas memory cost: %r', pd_size)
        else:
            pd_size = arrow_size * 10 if n_strings else arrow_size

        ctx[op.outputs[0].key] = (pd_size, pd_size + arrow_size)

    @classmethod
    def _cast_string_to_binary(cls, arrow_table):
        import pyarrow as pa

        new_schema = []
        for field in arrow_table.schema:
            if field.type == pa.string():
                new_schema.append(pa.field(field.name, pa.binary()))
            else:
                new_schema.append(field)

        return arrow_table.cast(pa.schema(new_schema))

    @classmethod
    def _append_partition_values(cls, arrow_table, op):
        import pyarrow as pa

        if op.append_partitions and op.partition_spec:
            from odps.types import PartitionSpec
            spec = PartitionSpec(op.partition_spec)

            for col_name, pt_val in spec.items():
                arrow_table = arrow_table.append_column(
                    col_name,
                    pa.array([pt_val] * arrow_table.num_rows, pa.string()))

        return arrow_table

    @staticmethod
    def _align_columns(data, expected_dtypes):
        data_columns = data.dtypes.index
        expected_columns = expected_dtypes.index
        if not data_columns.equals(expected_columns):
            logger.debug(
                "Data columns differ from output columns, "
                "data columns: %s, output columns: %s", data_columns,
                expected_columns)
            data.columns = expected_columns[:len(data.columns)]
            for extra_col in expected_columns[len(data.columns):]:
                data[extra_col] = pd.Series([],
                                            dtype=expected_dtypes[extra_col])
            if not data.dtypes.index.equals(expected_columns):
                data = data[expected_columns]
        return data

    @classmethod
    def _execute_in_cupid(cls, ctx, op):
        import pyarrow as pa
        from cupid.io.table import TableSplit

        out = op.outputs[0]

        if op.cupid_handle is None:
            empty_df = pd.DataFrame()
            for name, dtype in out.dtypes.items():
                empty_df[name] = pd.Series(dtype=dtype)
            ctx[out.key] = empty_df
            return

        tsp = TableSplit(
            _handle=op.cupid_handle,
            _split_index=op.split_index,
            _split_file_start=op.split_file_start,
            _split_file_end=op.split_file_end,
            _schema_file_start=op.schema_file_start,
            _schema_file_end=op.schema_file_end,
        )
        logger.debug('Read split table, split index: %s', op.split_index)
        reader = tsp.open_arrow_reader()
        if op.nrows is None:
            arrow_table = reader.read_all()
        else:
            nrows = 0
            batches = []
            while nrows < op.nrows:
                try:
                    batch = reader.read_next_batch()
                    nrows += batch.num_rows
                    batches.append(batch)
                except StopIteration:
                    break
            logger.debug('Read %s rows of this split.', op.nrows)
            arrow_table = pa.Table.from_batches(batches)

        arrow_table = cls._append_partition_values(arrow_table, op)

        if op.string_as_binary:
            arrow_table = cls._cast_string_to_binary(arrow_table)
        data = arrow_table_to_pandas_dataframe(
            arrow_table, use_arrow_dtype=op.use_arrow_dtype)
        if op.nrows is not None:
            data = data[:op.nrows]

        data = cls._align_columns(data, out.dtypes)

        logger.debug('Read split table finished, split index: %s',
                     op.split_index)
        logger.debug('Split data shape is %s, size is %s', data.shape,
                     data.memory_usage(deep=True).sum())
        ctx[out.key] = data

    @classmethod
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)

        if op.partition_spec is not None:
            download_session = tunnel.create_download_session(
                t.name, partition_spec=op.partition_spec)
        else:
            download_session = tunnel.create_download_session(t.name)
        logger.debug('Start reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        if op.nrows is None:
            count = op.end_index - op.start_index
        else:
            count = op.nrows

        with download_session.open_arrow_reader(op.start_index,
                                                count,
                                                columns=op.columns) as reader:
            table = reader.read()

        table = cls._append_partition_values(table, op)
        if op.string_as_binary:
            table = cls._cast_string_to_binary(table)
        data = arrow_table_to_pandas_dataframe(
            table, use_arrow_dtype=op.use_arrow_dtype)

        data = cls._align_columns(data, op.outputs[0].dtypes)

        logger.debug('Finish reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        ctx[op.outputs[0].key] = data

    @classmethod
    def execute(cls, ctx, op):
        from cupid.runtime import RuntimeContext

        if RuntimeContext.is_context_ready():
            cls._execute_in_cupid(ctx, op)
        else:
            cls._execute_arrow_tunnel(ctx, op)
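
A toy walk-through of the column-alignment logic in _align_columns above, with made-up column names: rename the columns that were read, append any expected columns that are missing, then reorder to the expected layout.

import pandas as pd

data = pd.DataFrame({'col0': [1, 2]})                  # columns as read
expected = pd.Series({'a': 'int64', 'c': 'float64'})   # dtypes on the output

data.columns = expected.index[:len(data.columns)]      # rename what is present
for extra in expected.index[len(data.columns):]:       # append missing columns
    data[extra] = pd.Series([], dtype=expected[extra])
data = data[expected.index]                            # reorder to match
# data now has columns ['a', 'c'], with 'c' filled with NaN.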