    def execute(cls, ctx, op):
        import pyarrow as pa
        from cupid.io.table import TableSplit

        if op.cupid_handle is None:
            empty_df = pd.DataFrame()
            for name, dtype in op.outputs[0].dtypes.items():
                empty_df[name] = pd.Series(dtype=dtype)
            ctx[op.outputs[0].key] = empty_df
            return

        tsp = TableSplit(
            _handle=op.cupid_handle,
            _split_index=op.split_index,
            _split_file_start=op.split_file_start,
            _split_file_end=op.split_file_end,
            _schema_file_start=op.schema_file_start,
            _schema_file_end=op.schema_file_end,
        )
        logger.debug('Read split table, split index: %s', op.split_index)
        reader = tsp.open_arrow_reader()
        if op.nrows is not None:
            nrows = 0
            batches = []
            while nrows < op.nrows:
                try:
                    batch = reader.read_next_batch()
                    nrows += batch.num_rows
                    batches.append(batch)
                except StopIteration:
                    break
            logger.debug('Read %s rows of this split.', op.nrows)
            data = arrow_table_to_pandas_dataframe(
                pa.Table.from_batches(batches),
                use_arrow_dtype=op.use_arrow_dtype)[:op.nrows]
        else:
            arrow_table = reader.read_all()
            data = arrow_table_to_pandas_dataframe(
                arrow_table, use_arrow_dtype=op.use_arrow_dtype)
        data_columns = data.dtypes.index
        expected_columns = op.outputs[0].dtypes.index
        if not data_columns.equals(expected_columns):
            logger.debug("Data columns differs from output columns, "
                         "data columns: {}, output columns: {}".format(
                             data_columns, expected_columns))
            data.columns = expected_columns

        logger.debug('Read split table finished, split index: %s',
                     op.split_index)
        logger.debug('Split data shape is %s, size is %s',
                     data.shape, data.memory_usage(deep=True).sum())
        ctx[op.outputs[0].key] = data
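The nrows branch above is a small, self-contained pattern: keep pulling record batches until enough rows have been collected, then trim after the pandas conversion. A minimal sketch of the same loop as a standalone helper, assuming a reader whose read_next_batch() raises StopIteration when exhausted (as used above):

import pyarrow as pa

def read_at_most(reader, nrows):
    # Collect record batches until at least `nrows` rows have been read or
    # the reader runs out; the caller trims the surplus after conversion.
    read_rows = 0
    batches = []
    while read_rows < nrows:
        try:
            batch = reader.read_next_batch()
        except StopIteration:
            break
        read_rows += batch.num_rows
        batches.append(batch)
    # Like the code above, this assumes at least one batch was read;
    # an empty list would require passing an explicit schema to from_batches.
    return pa.Table.from_batches(batches)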
Example #2
    def _execute_in_cupid(cls, ctx, op):
        out = op.outputs[0]

        if op.cupid_handle is None:
            ctx[out.key] = cls._build_empty_df(out)
            return

        split_config = dict(
            _handle=op.cupid_handle,
            _split_index=op.split_index,
            _split_file_start=op.split_file_start,
            _split_file_end=op.split_file_end,
            _schema_file_start=op.schema_file_start,
            _schema_file_end=op.schema_file_end,
        )
        cupid_client = CupidServiceClient()
        try:
            pa_table = cupid_client.read_table_data(split_config, op.nrows)
        finally:
            cupid_client.close()
            cupid_client = None
        pa_table = cls._append_partition_values(pa_table, op)

        if op.string_as_binary:
            pa_table = cls._cast_string_to_binary(pa_table)
        data = arrow_table_to_pandas_dataframe(
            pa_table, use_arrow_dtype=op.use_arrow_dtype)[:op.nrows]

        data = cls._align_output_data(op, data)

        logger.debug('Read split table finished, split index: %s',
                     op.split_index)
        logger.debug('Split data shape is %s', data.shape)
        ctx[out.key] = data
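This variant replaces the inline empty-DataFrame fallback of the first example with cls._build_empty_df(out). Based on that inline code, a sketch of what such a helper might look like (the name follows the call site; the library's actual implementation may differ):

import pandas as pd

def build_empty_df(out):
    # Zero-row DataFrame whose columns carry the dtypes expected by the
    # output chunk, mirroring the fallback used when cupid_handle is None.
    empty_df = pd.DataFrame()
    for name, dtype in out.dtypes.items():
        empty_df[name] = pd.Series(dtype=dtype)
    return empty_df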
Example #3
    def testToPandas(self):
        rs = np.random.RandomState(0)
        df = pd.DataFrame({'a': rs.rand(100),
                           'b': ['s' + str(i) for i in rs.randint(100, size=100)]})

        batch_size = 15
        n_batch = len(df) // batch_size + 1
        batches = [pa.RecordBatch.from_pandas(df[i * batch_size: (i + 1) * batch_size])
                   for i in range(n_batch)]
        table = pa.Table.from_batches(batches)

        df2 = arrow_table_to_pandas_dataframe(table)
        self.assertEqual(df2.dtypes.iloc[1], ArrowStringDtype())
        self.assertLess(df2.memory_usage(deep=True).sum(),
                        df.memory_usage(deep=True).sum())

        # test serialize
        df3 = dataserializer.loads(dataserializer.dumps(df2))
        self.assertEqual(df3.dtypes.iloc[1], ArrowStringDtype())
        pd.testing.assert_frame_equal(df3, df2)

        # test df method
        df4 = df2.groupby('b').sum()
        expected = df.groupby('b').sum()
        pd.testing.assert_frame_equal(df4, expected)

        s = ('s' + df2['b']).astype('string')
        expected = ('s' + df['b']).astype('string')
        pd.testing.assert_series_equal(s, expected)

        s2 = df2['b'].str[:2]
        expected = df['b'].astype('string').str[:2]
        pd.testing.assert_series_equal(s2, expected)
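The batching in the test above can be captured as a small utility: slice the DataFrame into fixed-size record batches and reassemble them into one Arrow table. A sketch that uses ceiling division instead of the test's `// batch_size + 1`, so no empty trailing batch is produced when the length divides evenly:

import pyarrow as pa

def dataframe_to_arrow_table(df, batch_size=15):
    # Slice the frame into fixed-size record batches, then rebuild a table.
    n_batch = (len(df) + batch_size - 1) // batch_size
    batches = [pa.RecordBatch.from_pandas(df[i * batch_size:(i + 1) * batch_size])
               for i in range(n_batch)]
    return pa.Table.from_batches(batches)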
Example #4
    def _execute_in_cupid(cls, ctx, op):
        import pyarrow as pa
        from cupid.io.table import TableSplit

        out = op.outputs[0]

        if op.cupid_handle is None:
            empty_df = pd.DataFrame()
            for name, dtype in out.dtypes.items():
                empty_df[name] = pd.Series(dtype=dtype)
            ctx[out.key] = empty_df
            return

        tsp = TableSplit(
            _handle=op.cupid_handle,
            _split_index=op.split_index,
            _split_file_start=op.split_file_start,
            _split_file_end=op.split_file_end,
            _schema_file_start=op.schema_file_start,
            _schema_file_end=op.schema_file_end,
        )
        logger.debug('Read split table, split index: %s', op.split_index)
        reader = tsp.open_arrow_reader()
        if op.nrows is None:
            arrow_table = reader.read_all()
        else:
            nrows = 0
            batches = []
            while nrows < op.nrows:
                try:
                    batch = reader.read_next_batch()
                    nrows += batch.num_rows
                    batches.append(batch)
                except StopIteration:
                    break
            logger.debug('Read %s rows of this split.', op.nrows)
            arrow_table = pa.Table.from_batches(batches)

        arrow_table = cls._append_partition_values(arrow_table, op)

        if op.string_as_binary:
            arrow_table = cls._cast_string_to_binary(arrow_table)
        data = arrow_table_to_pandas_dataframe(
            arrow_table, use_arrow_dtype=op.use_arrow_dtype)
        if op.nrows is not None:
            data = data[:op.nrows]

        data = cls._align_columns(data, out.dtypes)

        logger.debug('Read split table finished, split index: %s',
                     op.split_index)
        logger.debug('Split data shape is %s, size is %s', data.shape,
                     data.memory_usage(deep=True).sum())
        ctx[out.key] = data
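The column alignment delegated to cls._align_columns here is written out inline in the first example; under that assumption, a sketch of the helper (reusing the module-level logger seen above):

def align_columns(data, expected_dtypes):
    # Rename the result columns to the expected output columns when the
    # reader returns them under different names.
    data_columns = data.dtypes.index
    expected_columns = expected_dtypes.index
    if not data_columns.equals(expected_columns):
        logger.debug('Data columns differ from output columns, '
                     'data columns: %s, output columns: %s',
                     data_columns, expected_columns)
        data.columns = expected_columns
    return data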
Example #5
def test_to_pandas():
    rs = np.random.RandomState(0)
    df = pd.DataFrame({
        'a': rs.rand(100),
        'b': ['s' + str(i) for i in rs.randint(100, size=100)],
        'c': [['ss0' + str(i), 'ss1' + str(i)]
              for i in rs.randint(100, size=100)]
    })

    batch_size = 15
    n_batch = len(df) // batch_size + 1
    batches = [
        pa.RecordBatch.from_pandas(df[i * batch_size:(i + 1) * batch_size])
        for i in range(n_batch)
    ]
    table = pa.Table.from_batches(batches)

    df1 = arrow_table_to_pandas_dataframe(table, use_arrow_dtype=False)
    assert df1.dtypes.iloc[1] == np.dtype('O')
    assert df1.dtypes.iloc[2] == np.dtype('O')

    df2 = arrow_table_to_pandas_dataframe(table)
    assert df2.dtypes.iloc[1] == ArrowStringDtype()
    assert df2.dtypes.iloc[2] == ArrowListDtype(str)
    assert df2.memory_usage(deep=True).sum() < df.memory_usage(deep=True).sum()

    # test df method
    df4 = df2.groupby('b').sum()
    expected = df.groupby('b').sum()
    pd.testing.assert_frame_equal(df4, expected)

    s = ('s' + df2['b']).astype('string')
    expected = ('s' + df['b']).astype('string')
    pd.testing.assert_series_equal(s, expected)

    s2 = df2['b'].str[:2]
    expected = df['b'].astype('string').str[:2]
    pd.testing.assert_series_equal(s2, expected)
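A compact restatement of the use_arrow_dtype switch the test exercises: with it disabled, string columns come back as plain NumPy object columns; with the default, they come back as Arrow-backed extension columns. The assertions mirror the ones above, just on a tiny frame:

small = pa.Table.from_pandas(pd.DataFrame({'s': ['a', 'bb', 'ccc']}))
as_numpy = arrow_table_to_pandas_dataframe(small, use_arrow_dtype=False)
as_arrow = arrow_table_to_pandas_dataframe(small)
assert as_numpy['s'].dtype == np.dtype('O')
assert as_arrow['s'].dtype == ArrowStringDtype()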
Example #6
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)

        if op.partition_spec is not None:
            download_session = tunnel.create_download_session(
                t.name, partition_spec=op.partition_spec)
        else:
            download_session = tunnel.create_download_session(t.name)
        logger.debug('Start reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        if op.nrows is None:
            count = op.end_index - op.start_index
        else:
            count = op.nrows

        with download_session.open_arrow_reader(op.start_index,
                                                count,
                                                columns=op.columns) as reader:
            table = reader.read()

        table = cls._append_partition_values(table, op)
        if op.string_as_binary:
            table = cls._cast_string_to_binary(table)
        data = arrow_table_to_pandas_dataframe(
            table, use_arrow_dtype=op.use_arrow_dtype)

        data = cls._align_columns(data, op.outputs[0].dtypes)

        logger.debug('Finish reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        ctx[op.outputs[0].key] = data
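cls._cast_string_to_binary is only referenced in these examples, never shown. Going purely by its name and the way it is applied to the Arrow table when op.string_as_binary is set, a hypothetical sketch could cast every string column to binary (an assumption, not the library's actual code):

import pyarrow as pa

def cast_string_to_binary(table):
    # Hypothetical: replace each string column with a binary-typed copy;
    # non-string columns are left untouched.
    for idx, field in enumerate(table.schema):
        if pa.types.is_string(field.type):
            table = table.set_column(
                idx, pa.field(field.name, pa.binary()),
                table.column(idx).cast(pa.binary()))
    return table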
Example #7
    def _execute_arrow_tunnel(cls, ctx, op):
        from odps import ODPS
        from odps.tunnel import TableTunnel

        out = op.outputs[0]

        if op.table_name is None:
            # is empty table
            ctx[out.key] = cls._build_empty_df(out)
            return

        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            odps_params['project'] = project
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(odps_params['access_id'],
                 odps_params['secret_access_key'],
                 project=odps_params['project'],
                 endpoint=endpoint)

        t = o.get_table(op.table_name)
        tunnel = TableTunnel(o, project=t.project)
        retry_times = op.retry_times or options.retry_times

        retries = 0
        while True:
            try:
                if op.partition_spec is not None:
                    download_session = tunnel.create_download_session(
                        t.name, partition_spec=op.partition_spec)
                else:
                    download_session = tunnel.create_download_session(t.name)
                break
            except Exception:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        logger.debug('Start reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        if op.nrows is None:
            count = op.end_index - op.start_index
        else:
            count = op.nrows

        retries = 0
        while True:
            try:
                with download_session.open_arrow_reader(
                        op.start_index, count, columns=op.columns) as reader:
                    table = reader.read()
                break
            except Exception:
                if retries >= retry_times:
                    raise
                retries += 1
                time.sleep(1)

        table = cls._append_partition_values(table, op)
        if op.string_as_binary:
            table = cls._cast_string_to_binary(table)
        data = arrow_table_to_pandas_dataframe(
            table, use_arrow_dtype=op.use_arrow_dtype)
        data = cls._align_output_data(op, data)

        logger.debug('Finish reading table %s(%s) split from %s to %s',
                     op.table_name, op.partition_spec, op.start_index,
                     op.end_index)
        ctx[out.key] = data
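The two identical retry loops in this last example could be factored into a single helper; a sketch of that refactoring (not part of the original code):

import time

def call_with_retry(func, retry_times, delay=1):
    # Re-run `func` until it succeeds, sleeping between attempts and
    # re-raising once `retry_times` retries are exhausted.
    retries = 0
    while True:
        try:
            return func()
        except Exception:
            if retries >= retry_times:
                raise
            retries += 1
            time.sleep(delay)

# e.g. download_session = call_with_retry(
#     lambda: tunnel.create_download_session(t.name), retry_times)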