    def __call__(self, x):
        # the result is an empty DataFrame: only index/columns metadata is kept
        shape = (0,) * len(x.shape)
        index_value = parse_index(x.index_value.to_pandas()[:0], x.key,
                                  'index')
        columns_value = parse_index(x.columns_value.to_pandas()[:0],
                                    x.key,
                                    'columns',
                                    store_data=True)
        return self.new_dataframe([x],
                                  shape=shape,
                                  dtypes=x.dtypes[:0],
                                  index_value=index_value,
                                  columns_value=columns_value)
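
For context, a minimal sketch of the pattern the snippet above relies on, assuming parse_index is importable from mars.dataframe.utils (the import path is an assumption here): slicing a pandas index with [:0] keeps its dtype and name but drops the values, so the parsed IndexValue describes an empty result, while store_data=True keeps the actual labels (used for columns, which are small).

import pandas as pd

from mars.dataframe.utils import parse_index  # assumed import path

raw = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})

# an empty slice keeps dtype/name metadata but carries no values
index_value = parse_index(raw.index[:0])
# column labels are small, so store them verbatim
columns_value = parse_index(raw.columns, store_data=True)

print(index_value.to_pandas())    # empty index, original dtype preserved
print(columns_value.to_pandas())  # Index(['a', 'b'], dtype='object')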
Example #2
    def __call__(self, shape, chunk_bytes=None):
        import numpy as np
        import pandas as pd

        if np.isnan(shape[0]):
            index_value = parse_index(pd.RangeIndex(0))
        else:
            index_value = parse_index(pd.RangeIndex(shape[0]))
        columns_value = parse_index(self.dtypes.index, store_data=True)
        return self.new_dataframe(None,
                                  shape,
                                  dtypes=self.dtypes,
                                  index_value=index_value,
                                  columns_value=columns_value,
                                  chunk_bytes=chunk_bytes)
    def _tile_tunnel(cls, op):
        out_df = op.outputs[0]
        in_df = build_concatenated_rows_frame(op.inputs[0])

        out_chunks = []
        for chunk in in_df.chunks:
            chunk_op = DataFrameWriteTableSplit(
                dtypes=op.dtypes,
                table_name=op.table_name,
                odps_params=op.odps_params,
                partition_spec=op.partition_spec)
            index_value = parse_index(chunk.index_value.to_pandas()[:0], chunk)
            out_chunk = chunk_op.new_chunk([chunk],
                                           shape=(0, 0),
                                           index_value=index_value,
                                           columns_value=out_df.columns_value,
                                           dtypes=out_df.dtypes,
                                           index=chunk.index)
            out_chunks.append(out_chunk)

        new_op = op.copy()
        params = out_df.params.copy()
        params.update(
            dict(chunks=out_chunks,
                 nsplits=((0, ) * in_df.chunk_shape[0], (0, ))))
        return new_op.new_tileables([in_df], **params)
Example #4
def test_merge_index_value():
    with Timer() as timer:
        index_values = {i: parse_index(pd.RangeIndex(10 ** 7)) for i in range(20)}
        index_value = merge_index_value(index_values)
        pd.testing.assert_index_equal(index_value.to_pandas(),
                                      pd.Index([], dtype=np.int64))
        assert index_value.min_val == 0
        assert index_value.max_val == 1e7 - 1

        # range indexes that are continuous
        index_values = {
            i: parse_index(pd.RangeIndex(i * 10 ** 7, (i + 1) * 10 ** 7))
            for i in range(20)
        }
        index_value = merge_index_value(index_values)
        pd.testing.assert_index_equal(index_value.to_pandas(),
                                      pd.RangeIndex(10 ** 7 * 20))
        assert index_value.min_val == 0
        assert index_value.max_val == 1e7 * 20 - 1
    assert timer.duration < 1
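
A smaller, hedged illustration of the behavior checked above (merge_index_value is assumed to live next to parse_index in mars.dataframe.utils): identical per-chunk ranges collapse to an empty index that only records min/max, while contiguous ranges merge back into one RangeIndex, as the sketch below shows for three small chunks.

import pandas as pd

from mars.dataframe.utils import parse_index, merge_index_value  # assumed import path

# three chunks whose indexes are the contiguous ranges 0-9, 10-19, 20-29
chunk_index_values = {i: parse_index(pd.RangeIndex(i * 10, (i + 1) * 10))
                      for i in range(3)}
merged = merge_index_value(chunk_index_values)

print(merged.to_pandas())              # RangeIndex(start=0, stop=30, step=1)
print(merged.min_val, merged.max_val)  # 0 29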
Example #5
    def testParseIndex(self):
        index = pd.Int64Index([])
        parsed_index = parse_index(index)
        self.assertIsInstance(parsed_index.value, IndexValue.Int64Index)
        pd.testing.assert_index_equal(index, parsed_index.to_pandas())

        index = pd.Int64Index([1, 2])
        parsed_index = parse_index(index)  # not parse data
        self.assertIsInstance(parsed_index.value, IndexValue.Int64Index)
        with self.assertRaises(AssertionError):
            pd.testing.assert_index_equal(index, parsed_index.to_pandas())

        parsed_index = parse_index(index, store_data=True)  # parse data
        self.assertIsInstance(parsed_index.value, IndexValue.Int64Index)
        pd.testing.assert_index_equal(index, parsed_index.to_pandas())

        index = pd.RangeIndex(0, 10, 3)
        parsed_index = parse_index(index)
        self.assertIsInstance(parsed_index.value, IndexValue.RangeIndex)
        pd.testing.assert_index_equal(index, parsed_index.to_pandas())

        index = pd.MultiIndex.from_arrays([[0, 1], ['a', 'b']])
        parsed_index = parse_index(index)  # not parse data
        self.assertIsInstance(parsed_index.value, IndexValue.MultiIndex)
        with self.assertRaises(AssertionError):
            pd.testing.assert_index_equal(index, parsed_index.to_pandas())

        parsed_index = parse_index(index, store_data=True)  # parse data
        self.assertIsInstance(parsed_index.value, IndexValue.MultiIndex)
        pd.testing.assert_index_equal(index, parsed_index.to_pandas())
Example #6
    def testFilterIndexValue(self):
        pd_index = pd.RangeIndex(10)
        index_value = parse_index(pd_index)

        min_max = (0, True, 9, True)
        self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                         pd_index[(pd_index >= 0) & (pd_index <= 9)].tolist())

        min_max = (0, False, 9, False)
        self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                         pd_index[(pd_index > 0) & (pd_index < 9)].tolist())

        pd_index = pd.RangeIndex(1, 11, 3)
        index_value = parse_index(pd_index)

        min_max = (2, True, 10, True)
        self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                         pd_index[(pd_index >= 2) & (pd_index <= 10)].tolist())

        min_max = (2, False, 10, False)
        self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                         pd_index[(pd_index > 2) & (pd_index < 10)].tolist())

        pd_index = pd.RangeIndex(9, -1, -1)
        index_value = parse_index(pd_index)

        min_max = (0, True, 9, True)
        self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                         pd_index[(pd_index >= 0) & (pd_index <= 9)].tolist())

        min_max = (0, False, 9, False)
        self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                         pd_index[(pd_index > 0) & (pd_index < 9)].tolist())

        pd_index = pd.RangeIndex(10, 0, -3)
        index_value = parse_index(pd_index, store_data=False)

        min_max = (2, True, 10, True)
        self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                         pd_index[(pd_index >= 2) & (pd_index <= 10)].tolist())

        min_max = (2, False, 10, False)
        self.assertEqual(filter_index_value(index_value, min_max).to_pandas().tolist(),
                         pd_index[(pd_index > 2) & (pd_index < 10)].tolist())

        pd_index = pd.Int64Index([0, 3, 8])
        index_value = parse_index(pd_index, store_data=True)

        min_max = (2, True, 8, False)
        self.assertEqual(filter_index_value(index_value, min_max, store_data=True).to_pandas().tolist(),
                         pd_index[(pd_index >= 2) & (pd_index < 8)].tolist())

        index_value = parse_index(pd_index)

        min_max = (2, True, 8, False)
        filtered = filter_index_value(index_value, min_max)
        self.assertEqual(len(filtered.to_pandas().tolist()), 0)
        self.assertIsInstance(filtered.value, IndexValue.Int64Index)
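
A note on the min_max argument exercised throughout, inferred from the assertions above: it is a 4-tuple of (min, min_inclusive, max, max_inclusive), so for example:

min_max = (2, True, 8, False)   # keep index values v with 2 <= v < 8
min_max = (2, False, 8, False)  # keep index values v with 2 < v < 8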
Example #7
    def testInferIndexValue(self):
        # same range index
        index1 = pd.RangeIndex(1, 3)
        index2 = pd.RangeIndex(1, 3)

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertEqual(oival.key, ival1.key)
        self.assertEqual(oival.key, ival2.key)

        # different range index
        index1 = pd.RangeIndex(1, 3)
        index2 = pd.RangeIndex(2, 4)

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertIsInstance(oival.value, IndexValue.Int64Index)
        self.assertNotEqual(oival.key, ival1.key)
        self.assertNotEqual(oival.key, ival2.key)

        # same int64 index, all unique
        index1 = pd.Int64Index([1, 2])
        index2 = pd.Int64Index([1, 2])

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertIsInstance(oival.value, IndexValue.Int64Index)
        self.assertEqual(oival.key, ival1.key)
        self.assertEqual(oival.key, ival2.key)

        # same int64 index, not all unique
        index1 = pd.Int64Index([1, 2, 2])
        index2 = pd.Int64Index([1, 2, 2])

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertIsInstance(oival.value, IndexValue.Int64Index)
        self.assertNotEqual(oival.key, ival1.key)
        self.assertNotEqual(oival.key, ival2.key)

        # different int64 index
        index1 = pd.Int64Index([1, 2])
        index2 = pd.Int64Index([2, 3])

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertIsInstance(oival.value, IndexValue.Int64Index)
        self.assertNotEqual(oival.key, ival1.key)
        self.assertNotEqual(oival.key, ival2.key)

        # different index type
        index1 = pd.Int64Index([1, 2])
        index2 = pd.Float64Index([2.0, 3.0])

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertIsInstance(oival.value, IndexValue.Float64Index)
        self.assertNotEqual(oival.key, ival1.key)
        self.assertNotEqual(oival.key, ival2.key)

        # range index and other index
        index1 = pd.RangeIndex(1, 4)
        index2 = pd.Float64Index([2, 3, 4])

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertIsInstance(oival.value, IndexValue.Float64Index)
        self.assertNotEqual(oival.key, ival1.key)
        self.assertNotEqual(oival.key, ival2.key)

        index1 = pd.DatetimeIndex([])
        index2 = pd.RangeIndex(2)

        ival1 = parse_index(index1)
        ival2 = parse_index(index2)
        oival = infer_index_value(ival1, ival2)

        self.assertIsInstance(oival.value, IndexValue.Index)
        self.assertNotEqual(oival.key, ival1.key)
        self.assertNotEqual(oival.key, ival2.key)
Example #8
def test_infer_index_value():
    # same range index
    index1 = pd.RangeIndex(1, 3)
    index2 = pd.RangeIndex(1, 3)

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert oival.key == ival1.key
    assert oival.key == ival2.key

    # different range index
    index1 = pd.RangeIndex(1, 3)
    index2 = pd.RangeIndex(2, 4)

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # same int64 index, all unique
    index1 = pd.Int64Index([1, 2])
    index2 = pd.Int64Index([1, 2])

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key == ival1.key
    assert oival.key == ival2.key

    # same int64 index, not all unique
    index1 = pd.Int64Index([1, 2, 2])
    index2 = pd.Int64Index([1, 2, 2])

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # different int64 index
    index1 = pd.Int64Index([1, 2])
    index2 = pd.Int64Index([2, 3])

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # different index type
    index1 = pd.Int64Index([1, 2])
    index2 = pd.Float64Index([2.0, 3.0])

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert isinstance(oival.value, IndexValue.Float64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # range index and other index
    index1 = pd.RangeIndex(1, 4)
    index2 = pd.Float64Index([2, 3, 4])

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert isinstance(oival.value, IndexValue.Float64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    index1 = pd.DatetimeIndex([])
    index2 = pd.RangeIndex(2)

    ival1 = parse_index(index1)
    ival2 = parse_index(index2)
    oival = infer_index_value(ival1, ival2)

    assert isinstance(oival.value, IndexValue.Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key
Example #9
def test_filter_index_value():
    pd_index = pd.RangeIndex(10)
    index_value = parse_index(pd_index)

    min_max = (0, True, 9, True)
    assert filter_index_value(
        index_value,
        min_max).to_pandas().tolist() == pd_index[(pd_index >= 0)
                                                  & (pd_index <= 9)].tolist()

    min_max = (0, False, 9, False)
    assert filter_index_value(
        index_value,
        min_max).to_pandas().tolist() == pd_index[(pd_index > 0)
                                                  & (pd_index < 9)].tolist()

    pd_index = pd.RangeIndex(1, 11, 3)
    index_value = parse_index(pd_index)

    min_max = (2, True, 10, True)
    assert filter_index_value(
        index_value,
        min_max).to_pandas().tolist() == pd_index[(pd_index >= 2)
                                                  & (pd_index <= 10)].tolist()

    min_max = (2, False, 10, False)
    assert filter_index_value(
        index_value,
        min_max).to_pandas().tolist() == pd_index[(pd_index > 2)
                                                  & (pd_index < 10)].tolist()

    pd_index = pd.RangeIndex(9, -1, -1)
    index_value = parse_index(pd_index)

    min_max = (0, True, 9, True)
    assert filter_index_value(
        index_value,
        min_max).to_pandas().tolist() == pd_index[(pd_index >= 0)
                                                  & (pd_index <= 9)].tolist()

    min_max = (0, False, 9, False)
    assert filter_index_value(
        index_value,
        min_max).to_pandas().tolist() == pd_index[(pd_index > 0)
                                                  & (pd_index < 9)].tolist()

    pd_index = pd.RangeIndex(10, 0, -3)
    index_value = parse_index(pd_index, store_data=False)

    min_max = (2, True, 10, True)
    assert filter_index_value(
        index_value,
        min_max).to_pandas().tolist() == pd_index[(pd_index >= 2)
                                                  & (pd_index <= 10)].tolist()

    min_max = (2, False, 10, False)
    assert filter_index_value(
        index_value,
        min_max).to_pandas().tolist() == pd_index[(pd_index > 2)
                                                  & (pd_index < 10)].tolist()

    pd_index = pd.Int64Index([0, 3, 8])
    index_value = parse_index(pd_index, store_data=True)

    min_max = (2, True, 8, False)
    assert filter_index_value(
        index_value, min_max,
        store_data=True).to_pandas().tolist() == pd_index[
            (pd_index >= 2) & (pd_index < 8)].tolist()

    index_value = parse_index(pd_index)

    min_max = (2, True, 8, False)
    filtered = filter_index_value(index_value, min_max)
    assert len(filtered.to_pandas().tolist()) == 0
    assert isinstance(filtered.value, IndexValue.Int64Index)

    def tile(cls, op):
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from odps.errors import ODPSError
        from cupid import CupidSession, context
        from cupid.errors import CupidError
        from mars.context import get_context

        cupid_ctx = context()
        if cupid_ctx is None:
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )

        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        o = ODPS(None, None, account=account, **odps_params)
        cupid_session = CupidSession(o)

        mars_context = get_context()

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

        data_src = o.get_table(op.table_name)
        if op.partition is not None:
            data_src = data_src.get_partition(op.partition)

        try:
            data_store_size = data_src.size
        except ODPSError:
            # fail to get data size, just ignore
            pass
        else:
            if data_store_size < split_size and mars_context is not None:
                # get worker counts
                worker_count = max(len(mars_context.get_worker_addresses()), 1)
                # data is too small, split as many as number of cores
                split_size = data_store_size // worker_count
                # at least 1M
                split_size = max(split_size, 1 * 1024**2)
                logger.debug(
                    'Input data size is too small, split_size is {}'.format(
                        split_size))

        logger.debug(
            'Start creating download session of table {} from cupid.'.format(
                op.table_name))
        while True:
            try:
                download_session = cupid_session.create_download_session(
                    data_src, split_size=split_size, columns=op.columns)
                break
            except CupidError:
                logger.debug(
                    'The number of splits exceeds 100000, split_size is {}'.
                    format(split_size))
                if split_size >= MAX_CHUNK_SIZE:
                    raise
                else:
                    split_size *= 2

        logger.debug('%s table splits have been created.',
                     str(len(download_session.splits)))

        if np.isnan(df.shape[0]):
            est_chunk_rows = [None] * len(download_session.splits)
        else:
            sp_file_sizes = np.array([
                sp.split_file_end - sp.split_file_start
                for sp in download_session.splits
            ])
            total_size = sp_file_sizes.sum()
            est_chunk_rows = sp_file_sizes * df.shape[0] // total_size

        logger.warning('Estimated chunk rows: %r', est_chunk_rows)

        out_chunks = []
        # Ignore add_offset at this time.
        op._add_offset = False

        if len(download_session.splits) == 0:
            logger.debug('Table {} has no data'.format(op.table_name))
            chunk_op = DataFrameReadTableSplit()
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(df.dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None,
                                           shape=(np.nan, df.shape[1]),
                                           dtypes=op.dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(0, 0))
            out_chunks = [out_chunk]
        else:
            for idx, split in enumerate(download_session.splits):
                chunk_op = DataFrameReadTableSplit(
                    cupid_handle=to_str(split.handle),
                    split_index=split.split_index,
                    split_file_start=split.split_file_start,
                    split_file_end=split.split_file_end,
                    schema_file_start=split.schema_file_start,
                    schema_file_end=split.schema_file_end,
                    add_offset=op.add_offset,
                    dtypes=op.dtypes,
                    sparse=op.sparse,
                    split_size=split_size,
                    use_arrow_dtype=op.use_arrow_dtype,
                    estimate_rows=est_chunk_rows[idx])
                # the chunk shape is unknown
                index_value = parse_index(pd.RangeIndex(0))
                columns_value = parse_index(df.dtypes.index, store_data=True)
                out_chunk = chunk_op.new_chunk(None,
                                               shape=(np.nan, df.shape[1]),
                                               dtypes=op.dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(idx, 0))
                out_chunks.append(out_chunk)

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (df.shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=df.shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=df.columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)
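
When the total row count of the frame is known, the tile method above apportions it across splits in proportion to each split's byte size (the est_chunk_rows step); a hedged worked example with made-up numbers:

import numpy as np

sp_file_sizes = np.array([40, 40, 20])   # bytes per split (illustrative)
total_rows = 1000                        # df.shape[0]
est_chunk_rows = sp_file_sizes * total_rows // sp_file_sizes.sum()
print(est_chunk_rows)                    # [400 400 200]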
Example #11
    def tile(cls, op):
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.errors import CupidError

        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        o = ODPS(None, None, account=account, **odps_params)
        cupid_session = CupidSession(o)

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or CHUNK_LIMIT

        data_src = o.get_table(op.table_name)
        if op.partition is not None:
            data_src = data_src.get_partition(op.partition)

        logger.debug('Start creating download session from cupid.')
        while True:
            try:
                download_session = cupid_session.create_download_session(
                    data_src, split_size=split_size, columns=op.columns)
                break
            except CupidError:
                logger.debug(
                    'The number of splits exceeds 100000, split_size is {}'.
                    format(split_size))
                if split_size >= MAX_CHUNK_SIZE:
                    raise
                else:
                    split_size *= 2

        logger.debug('%s table splits have been created.',
                     str(len(download_session.splits)))

        out_chunks = []
        # Ignore add_offset at this time.
        op._add_offset = False

        for idx, split in enumerate(download_session.splits):
            chunk_op = DataFrameReadTableSplit(
                cupid_handle=to_str(split.handle),
                split_index=split.split_index,
                split_file_start=split.split_file_start,
                split_file_end=split.split_file_end,
                schema_file_start=split.schema_file_start,
                schema_file_end=split.schema_file_end,
                add_offset=op.add_offset,
                dtypes=op.dtypes,
                sparse=op.sparse)
            # the chunk shape is unknown
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(df.dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None,
                                           shape=(np.nan, df.shape[1]),
                                           dtypes=op.dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(idx, 0))
            out_chunks.append(out_chunk)

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (df.shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=df.shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=df.columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)
Example #12
    def _tile_tunnel(cls, op):
        from odps import ODPS

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        table_obj = o.get_table(op.table_name)
        if not table_obj.schema.partitions:
            data_srcs = [table_obj]
        elif op.partition is not None and check_partition_exist(
                table_obj, op.partition):
            data_srcs = [table_obj.get_partition(op.partition)]
        else:
            data_srcs = list(table_obj.partitions)
            if op.partition is not None:
                data_srcs = filter_partitions(o, data_srcs, op.partition)

        out_chunks = []
        row_nsplits = []
        index_start = 0
        df = op.outputs[0]

        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index, store_data=True)

        if len(data_srcs) == 0:
            # no partitions are selected
            chunk_op = DataFrameReadTableSplit()
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(out_dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None,
                                           shape=(0, out_shape[1]),
                                           dtypes=op.dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(index_start, 0))
            out_chunks.append(out_chunk)
        else:
            retry_times = op.retry_times or options.retry_times
            for data_src in data_srcs:
                data_store_size = data_src.size

                retries = 0
                while True:
                    try:
                        with data_src.open_reader() as reader:
                            record_count = reader.count
                        break
                    except Exception:
                        if retries >= retry_times:
                            raise
                        retries += 1
                        time.sleep(1)
                if data_store_size == 0:
                    # empty table
                    chunk_op = DataFrameReadTableSplit()
                    index_value = parse_index(pd.RangeIndex(0))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(None,
                                                   shape=(0, out_shape[1]),
                                                   dtypes=op.dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(index_start, 0))
                    out_chunks.append(out_chunk)
                    index_start += 1
                    continue
                chunk_size = df.extra_params.chunk_size

                partition_spec = str(data_src.partition_spec) \
                    if getattr(data_src, 'partition_spec', None) else None

                if chunk_size is None:
                    chunk_bytes = df.extra_params.chunk_bytes or CHUNK_BYTES_LIMIT
                    chunk_count = data_store_size // chunk_bytes + (
                        data_store_size % chunk_bytes != 0)
                    chunk_size = ceildiv(record_count, chunk_count)
                    split_size = chunk_bytes
                else:
                    chunk_count = ceildiv(record_count, chunk_size)
                    split_size = data_store_size // chunk_count

                for i in range(chunk_count):
                    start_index = chunk_size * i
                    end_index = min(chunk_size * (i + 1), record_count)
                    row_size = end_index - start_index
                    chunk_op = DataFrameReadTableSplit(
                        table_name=op.table_name,
                        partition_spec=partition_spec,
                        start_index=start_index,
                        end_index=end_index,
                        nrows=op.nrows,
                        odps_params=op.odps_params,
                        columns=op.columns,
                        incremental_index=op.incremental_index,
                        dtypes=out_dtypes,
                        sparse=op.sparse,
                        split_size=split_size,
                        use_arrow_dtype=op.use_arrow_dtype,
                        estimate_rows=row_size,
                        append_partitions=op.append_partitions,
                        memory_scale=op.memory_scale,
                        retry_times=op.retry_times,
                        extra_params=op.extra_params)
                    index_value = parse_index(
                        pd.RangeIndex(start_index, end_index))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(None,
                                                   shape=(row_size,
                                                          out_shape[1]),
                                                   dtypes=out_dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(index_start + i, 0))
                    row_nsplits.append(row_size)
                    out_chunks.append(out_chunk)

                index_start += chunk_count

        if op.incremental_index and _NEED_STANDARDIZE:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = (tuple(row_nsplits), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)
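
A hedged worked example of the sizing branch above when chunk_size is not set (illustrative numbers only): the table's byte size decides how many chunks to cut, and the record count is then divided evenly across them.

data_store_size = 1024 ** 3     # 1 GiB of table data
chunk_bytes = 64 * 1024 ** 2    # 64 MiB per chunk, standing in for CHUNK_BYTES_LIMIT
record_count = 1_000_000

chunk_count = data_store_size // chunk_bytes + (data_store_size % chunk_bytes != 0)
chunk_size = -(-record_count // chunk_count)   # ceildiv

print(chunk_count, chunk_size)  # 16 chunks of 62500 rows each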
Example #13
    def _tile_cupid(cls, op):
        import numpy as np
        import pandas as pd
        from mars.core.context import get_context

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or CHUNK_BYTES_LIMIT

        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index, store_data=True)

        mars_context = get_context()
        if mars_context is not None:
            worker_count = len(mars_context.get_worker_addresses())
        else:
            worker_count = None

        cupid_client = CupidServiceClient()
        try:
            parts = cupid_client.enum_table_partitions(op.odps_params,
                                                       op.table_name,
                                                       op.partition)
            if parts is None:
                parts = [None]

            out_chunks = []
            chunk_idx = 0

            for partition_spec in parts:
                splits, split_size = cupid_client.create_table_download_session(
                    op.odps_params, op.table_name, partition_spec, op.columns,
                    worker_count, split_size, MAX_CHUNK_NUM,
                    op.with_split_meta_on_tile)

                logger.debug('%s table splits have been created.',
                             str(len(splits)))

                meta_chunk_rows = [split.meta_row_count for split in splits]
                if np.isnan(out_shape[0]):
                    est_chunk_rows = meta_chunk_rows
                else:
                    sp_file_sizes = np.array([
                        sp.split_file_end - sp.split_file_start
                        for sp in splits
                    ])
                    total_size = sp_file_sizes.sum()
                    ratio_chunk_rows = (sp_file_sizes * out_shape[0] //
                                        total_size).tolist()
                    est_chunk_rows = [
                        mr if mr is not None else rr
                        for mr, rr in zip(meta_chunk_rows, ratio_chunk_rows)
                    ]

                logger.warning('Estimated chunk rows: %r', est_chunk_rows)

                if len(splits) == 0:
                    logger.debug('Table %s has no data', op.table_name)
                    chunk_op = DataFrameReadTableSplit()
                    index_value = parse_index(pd.RangeIndex(0))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(None,
                                                   shape=(np.nan,
                                                          out_shape[1]),
                                                   dtypes=op.dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(chunk_idx, 0))
                    out_chunks.append(out_chunk)
                    chunk_idx += 1
                else:
                    for idx, split in enumerate(splits):
                        chunk_op = DataFrameReadTableSplit(
                            cupid_handle=to_str(split.handle),
                            split_index=split.split_index,
                            split_file_start=split.split_file_start,
                            split_file_end=split.split_file_end,
                            schema_file_start=split.schema_file_start,
                            schema_file_end=split.schema_file_end,
                            incremental_index=op.incremental_index,
                            dtypes=out_dtypes,
                            sparse=op.sparse,
                            split_size=split_size,
                            string_as_binary=op.string_as_binary,
                            use_arrow_dtype=op.use_arrow_dtype,
                            estimate_rows=est_chunk_rows[idx],
                            partition_spec=partition_spec,
                            append_partitions=op.append_partitions,
                            meta_raw_size=split.meta_raw_size,
                            nrows=meta_chunk_rows[idx] or op.nrows,
                            memory_scale=op.memory_scale,
                            extra_params=op.extra_params)
                        # the chunk shape is unknown
                        index_value = parse_index(pd.RangeIndex(0))
                        columns_value = parse_index(out_dtypes.index,
                                                    store_data=True)
                        out_chunk = chunk_op.new_chunk(
                            None,
                            shape=(np.nan, out_shape[1]),
                            dtypes=out_dtypes,
                            index_value=index_value,
                            columns_value=columns_value,
                            index=(chunk_idx, 0))
                        chunk_idx += 1
                        out_chunks.append(out_chunk)
        finally:
            cupid_client.close()

        if op.incremental_index and _NEED_STANDARDIZE:
            out_chunks = standardize_range_index(out_chunks)
        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)
Example #14
    def tile(cls, op):
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context

        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        o = ODPS(None, None, account=account, **op.odps_params)
        cupid_session = CupidSession(o)

        df = op.outputs[0]
        split_size = df.extra_params.chunk_store_limit or options.tensor.chunk_store_limit

        data_src = o.get_table(op.table_name)
        if op.partition is not None:
            data_src = data_src.get_partition(op.partition)

        logger.debug('Start creating download session from cupid.')
        download_session = cupid_session.create_download_session(
            data_src, split_size=split_size)
        logger.debug('%s table splits have been created.',
                     str(len(download_session.splits)))

        out_chunks = []
        out_count_chunks = []
        for idx, split in enumerate(download_session.splits):
            chunk_op = DataFrameReadTableSplit(
                cupid_handle=to_str(split.handle),
                split_index=split.split_index,
                split_file_start=split.split_file_start,
                split_file_end=split.split_file_end,
                schema_file_start=split.schema_file_start,
                schema_file_end=split.schema_file_end,
                dtypes=op.dtypes,
                sparse=op.sparse)
            # the chunk shape is unknown
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(df.dtypes.index, store_data=True)
            out_chunk, out_count_chunk = chunk_op.new_chunks(
                None,
                kws=[{
                    'shape': (np.nan, df.shape[1]),
                    'dtypes': op.dtypes,
                    'index_value': index_value,
                    'columns_value': columns_value,
                    'index': (idx, )
                }, {
                    'shape': (1, ),
                    'index': (idx, )
                }])
            out_chunks.append(out_chunk)
            out_count_chunks.append(out_count_chunk)

        if op.add_offset:
            output_chunks = []
            for i, chunk in enumerate(out_chunks):
                if i == 0:
                    output_chunks.append(chunk)
                    continue
                counts = out_count_chunks[:i]
                inputs = [chunk] + counts
                output_chunk = DataFrameReadTableWithOffset(
                    dtypes=chunk.dtypes).new_chunk(
                        inputs,
                        shape=chunk.shape,
                        index=chunk.index,
                        dtypes=chunk.dtypes,
                        index_value=chunk.index_value,
                        columns_value=chunk.columns_value)
                output_chunks.append(output_chunk)
        else:
            output_chunks = out_chunks

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(output_chunks), (df.shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=df.shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=df.columns_value,
                                     chunks=output_chunks,
                                     nsplits=nsplits)

    def _tile_tunnel(cls, op):
        from odps import ODPS

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        table_obj = o.get_table(op.table_name)
        if not table_obj.schema.partitions:
            data_srcs = [table_obj]
        elif op.partition is not None and check_partition_exist(
                table_obj, op.partition):
            data_srcs = [table_obj.get_partition(op.partition)]
        else:
            data_srcs = list(table_obj.partitions)
            if op.partition is not None:
                data_srcs = filter_partitions(o, data_srcs, op.partition)

        out_chunks = []
        row_nsplits = []
        index_start = 0
        df = op.outputs[0]

        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index, store_data=True)

        for data_src in data_srcs:
            data_store_size = data_src.size
            shape = out_shape
            chunk_size = df.extra_params.chunk_size

            partition_spec = str(data_src.partition_spec) \
                if getattr(data_src, 'partition_spec', None) else None

            if chunk_size is None:
                chunk_bytes = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT
                chunk_count = data_store_size // chunk_bytes + (
                    data_store_size % chunk_bytes != 0)
                chunk_size = ceildiv(shape[0], chunk_count)
                split_size = chunk_bytes
            else:
                chunk_count = ceildiv(shape[0], chunk_size)
                split_size = data_store_size // chunk_count

            for i in range(chunk_count):
                start_index = chunk_size * i
                end_index = min(chunk_size * (i + 1), shape[0])
                row_size = end_index - start_index
                chunk_op = DataFrameReadTableSplit(
                    table_name=op.table_name,
                    partition_spec=partition_spec,
                    start_index=start_index,
                    end_index=end_index,
                    nrows=op.nrows,
                    odps_params=op.odps_params,
                    columns=op.columns,
                    add_offset=op.add_offset,
                    dtypes=out_dtypes,
                    sparse=op.sparse,
                    split_size=split_size,
                    use_arrow_dtype=op.use_arrow_dtype,
                    estimate_rows=row_size,
                    append_partitions=op.append_partitions,
                    memory_scale=op.memory_scale)
                index_value = parse_index(pd.RangeIndex(
                    start_index, end_index))
                columns_value = parse_index(out_dtypes.index, store_data=True)
                out_chunk = chunk_op.new_chunk(None,
                                               shape=(row_size, out_shape[1]),
                                               dtypes=out_dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(index_start + i, 0))
                row_nsplits.append(row_size)
                out_chunks.append(out_chunk)

            index_start += chunk_count

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = (tuple(row_nsplits), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)

    def _tile_cupid(cls, op):
        import numpy as np
        import pandas as pd
        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from odps.errors import ODPSError
        from cupid import CupidSession, context
        from cupid.errors import CupidError
        from mars.context import get_context

        cupid_ctx = context()

        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None,
                 None,
                 account=account,
                 project=odps_params['project'],
                 endpoint=endpoint)
        cupid_session = CupidSession(o)

        mars_context = get_context()

        df = op.outputs[0]
        split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

        out_dtypes = df.dtypes
        out_shape = df.shape
        out_columns_value = df.columns_value
        if op.columns is not None:
            out_dtypes = out_dtypes[op.columns]
            out_shape = (df.shape[0], len(op.columns))
            out_columns_value = parse_index(out_dtypes.index, store_data=True)

        table_obj = o.get_table(op.table_name)
        if not table_obj.schema.partitions:
            data_srcs = [table_obj]
        elif op.partition is not None and check_partition_exist(
                table_obj, op.partition):
            data_srcs = [table_obj.get_partition(op.partition)]
        else:
            data_srcs = list(table_obj.partitions)
            if op.partition is not None:
                data_srcs = filter_partitions(o, data_srcs, op.partition)

        out_chunks = []
        chunk_idx = 0

        for data_src in data_srcs:
            try:
                data_store_size = data_src.size
            except ODPSError:
                # fail to get data size, just ignore
                pass
            else:
                if data_store_size < split_size and mars_context is not None:
                    # get worker counts
                    worker_count = max(
                        len(mars_context.get_worker_addresses()), 1)
                    # data is too small, split as many as number of cores
                    split_size = data_store_size // worker_count
                    # at least 1M
                    split_size = max(split_size, 1 * 1024**2)
                    logger.debug(
                        'Input data size is too small, split_size is %s',
                        split_size)

            logger.debug(
                'Start creating download session of table %s from cupid, '
                'columns: %s', op.table_name, op.columns)
            while True:
                try:
                    download_session = cupid_session.create_download_session(
                        data_src,
                        split_size=split_size,
                        columns=op.columns,
                        with_split_meta=op.with_split_meta_on_tile)
                    break
                except CupidError:
                    logger.debug(
                        'The number of splits exceeds 100000, split_size is %s',
                        split_size)
                    if split_size >= MAX_CHUNK_SIZE:
                        raise
                    else:
                        split_size *= 2

            logger.debug('%s table splits have been created.',
                         str(len(download_session.splits)))

            meta_chunk_rows = [
                split.meta_row_count for split in download_session.splits
            ]
            if np.isnan(out_shape[0]):
                est_chunk_rows = meta_chunk_rows
            else:
                sp_file_sizes = np.array([
                    sp.split_file_end - sp.split_file_start
                    for sp in download_session.splits
                ])
                total_size = sp_file_sizes.sum()
                ratio_chunk_rows = (sp_file_sizes * out_shape[0] //
                                    total_size).tolist()
                est_chunk_rows = [
                    mr if mr is not None else rr
                    for mr, rr in zip(meta_chunk_rows, ratio_chunk_rows)
                ]

            partition_spec = str(data_src.partition_spec) \
                if getattr(data_src, 'partition_spec', None) else None

            logger.warning('Estimated chunk rows: %r', est_chunk_rows)

            if len(download_session.splits) == 0:
                logger.debug('Table %s has no data', op.table_name)
                chunk_op = DataFrameReadTableSplit()
                index_value = parse_index(pd.RangeIndex(0))
                columns_value = parse_index(out_dtypes.index, store_data=True)
                out_chunk = chunk_op.new_chunk(None,
                                               shape=(np.nan, out_shape[1]),
                                               dtypes=op.dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(chunk_idx, 0))
                out_chunks.append(out_chunk)
                chunk_idx += 1
            else:
                for idx, split in enumerate(download_session.splits):
                    chunk_op = DataFrameReadTableSplit(
                        cupid_handle=to_str(split.handle),
                        split_index=split.split_index,
                        split_file_start=split.split_file_start,
                        split_file_end=split.split_file_end,
                        schema_file_start=split.schema_file_start,
                        schema_file_end=split.schema_file_end,
                        add_offset=op.add_offset,
                        dtypes=out_dtypes,
                        sparse=op.sparse,
                        split_size=split_size,
                        string_as_binary=op.string_as_binary,
                        use_arrow_dtype=op.use_arrow_dtype,
                        estimate_rows=est_chunk_rows[idx],
                        partition_spec=partition_spec,
                        append_partitions=op.append_partitions,
                        meta_raw_size=split.meta_raw_size,
                        nrows=meta_chunk_rows[idx] or op.nrows,
                        memory_scale=op.memory_scale)
                    # the chunk shape is unknown
                    index_value = parse_index(pd.RangeIndex(0))
                    columns_value = parse_index(out_dtypes.index,
                                                store_data=True)
                    out_chunk = chunk_op.new_chunk(None,
                                                   shape=(np.nan,
                                                          out_shape[1]),
                                                   dtypes=out_dtypes,
                                                   index_value=index_value,
                                                   columns_value=columns_value,
                                                   index=(chunk_idx, 0))
                    chunk_idx += 1
                    out_chunks.append(out_chunk)

        if op.add_offset:
            out_chunks = standardize_range_index(out_chunks)

        new_op = op.copy()
        nsplits = ((np.nan, ) * len(out_chunks), (out_shape[1], ))
        return new_op.new_dataframes(None,
                                     shape=out_shape,
                                     dtypes=op.dtypes,
                                     index_value=df.index_value,
                                     columns_value=out_columns_value,
                                     chunks=out_chunks,
                                     nsplits=nsplits)