def test_iteration_echo_io() -> None:
    """Bytes written to the echo pipe come back out in order, and the
    read side drains then reports EOF once the write side is closed."""
    reader, writer = iteration.echo_io()
    writer.write(b'hello\n')
    assert reader.read(6) == b'hello\n'
    writer.write(b'world\n')
    # Partial reads return exactly the requested number of bytes.
    assert reader.read(2) == b'wo'
    assert reader.read(2) == b'rl'
    writer.close()
    # Remaining buffered bytes are still readable after close...
    assert reader.read() == b'd\n'
    # ...and a subsequent read signals EOF with an empty result.
    assert reader.read() == b''
def test_session_gen_stream() -> None:
    """Round-trip a million-row frame through ClickHouse via echo pipes:
    CSV in through an async insert, CSV back out through an async select."""
    session = ck.LocalSession(stop=True)
    session.query('drop table if exists pyck_test')
    session.query('create table pyck_test (x Int64) engine = Memory')
    source = pandas.DataFrame({'x': pandas.RangeIndex(1000000)})
    # Feed CSV into the insert through the write end of an echo pipe.
    reader, writer = iteration.echo_io()
    wait = session.query_stream_async(
        'insert into pyck_test format CSVWithNames', stream_in=reader)
    source.to_csv(io.TextIOWrapper(writer), index=False)
    wait()
    # Pull the rows back out through a fresh pipe and parse them.
    reader, writer = iteration.echo_io()
    wait = session.query_stream_async(
        'select * from pyck_test format CSVWithNames', stream_out=writer)
    result = pandas.read_csv(io.TextIOWrapper(reader))
    wait()
    assert result.x.to_list() == source.x.to_list()
    session.query('drop table pyck_test')
def query_pandas_async(
    self,
    query: str,
    dataframe: typing.Optional[pandas.DataFrame] = None,
    method: typing.Optional[
        typing_extensions.Literal['tcp', 'http', 'ssh']] = None,
    settings: typing.Optional[typing.Dict[str, str]] = None,
    join_interval: float = 0.1
) -> typing.Callable[[], typing.Optional[pandas.DataFrame]]:
    """Run *query* asynchronously, exchanging data in Arrow stream format.

    If *dataframe* is None the query's output is collected into a new
    DataFrame; otherwise *dataframe* is serialized and fed to the query
    as its input (e.g. for an INSERT).

    Args:
        query: SQL text; ' format ArrowStream' is appended before running.
        dataframe: input frame to send, or None to receive output.
        method: transport selector forwarded to self._run
            ('tcp', 'http' or 'ssh'); semantics defined by self._run.
        settings: query settings forwarded to self._run.
        join_interval: polling period in seconds used while waiting for
            the background Arrow (de)serialization thread.

    Returns:
        A zero-argument join callable: it blocks until completion,
        re-raises any error captured on the worker thread, joins the
        underlying query, then returns the received DataFrame
        (receive mode) or None (send mode).
    """
    batch = None
    error = None
    # prepare: an in-memory echo pipe couples this process to the query's
    # byte-stream generators; direction depends on the mode chosen below
    read_stream, write_stream = iteration.echo_io()
    if dataframe is None:
        # receive mode: the query writes Arrow bytes into write_stream
        gen_in = iteration.empty_in()
        gen_out = iteration.stream_out(write_stream)
    else:
        # send mode: the query reads Arrow bytes from read_stream
        gen_in = iteration.stream_in(read_stream)
        gen_out = iteration.empty_out()
    raw_join = self._run(
        f'{query} format ArrowStream', gen_in, gen_out, method, settings)

    # create thread: (de)serialization must run concurrently with the
    # query so neither end of the pipe blocks the other
    def handle_batch() -> None:
        # Worker: converts between Arrow stream bytes and the DataFrame.
        nonlocal dataframe
        nonlocal batch
        nonlocal error
        try:
            if dataframe is None:
                # read the Arrow stream the query produces
                batch = pyarrow.RecordBatchStreamReader(read_stream)
                dataframe = batch.read_pandas()
            else:
                # serialize the input frame into the Arrow stream
                table = pyarrow.Table.from_pandas(dataframe)
                batch = pyarrow.RecordBatchStreamWriter(
                    write_stream, table.schema)
                batch.write_table(table)
                dataframe = None
            batch.close()
            # closing the write end signals end-of-stream to the consumer
            write_stream.close()
        except BaseException as raw_error:  # pylint: disable=broad-except
            # captured here, re-raised on the caller's thread in join()
            error = raw_error
    thread = threading.Thread(target=handle_batch)
    thread.start()

    # join thread
    def join() -> typing.Optional[pandas.DataFrame]:
        # Poll with a timeout so a worker failure is noticed promptly.
        while error is None and thread.is_alive():
            thread.join(join_interval)
        if error is not None:
            raise error  # pylint: disable=raising-bad-type
        # NOTE(review): raw_join() is skipped on the error path —
        # presumably self._run cleans up on failure; confirm.
        raw_join()
        return dataframe
    return join
def query_pandas_async(
    self,
    query: str,
    dataframe: typing.Optional[pandas.DataFrame] = None,
    encoding: typing.Optional[str] = 'utf-8',
    method: typing.Optional[
        typing_extensions.Literal['tcp', 'http', 'ssh']] = None,
    settings: typing.Optional[typing.Dict[str, str]] = None,
    join_interval: float = 0.1
) -> typing.Callable[[], typing.Optional[pandas.DataFrame]]:
    """Run *query* asynchronously, exchanging data in Arrow stream format.

    If *dataframe* is None the query's output is collected into a new
    DataFrame; otherwise *dataframe* is serialized and fed to the query
    as its input (e.g. for an INSERT). When *encoding* is set, object
    columns are recursively converted between str and bytes around the
    Arrow boundary.

    Args:
        query: SQL text; ' format ArrowStream' is appended before running.
        dataframe: input frame to send, or None to receive output.
        encoding: text codec used to decode received bytes values to str
            and encode sent str values to bytes; None disables conversion.
        method: transport selector forwarded to self._run
            ('tcp', 'http' or 'ssh'); semantics defined by self._run.
        settings: query settings forwarded to self._run.
        join_interval: polling period in seconds used while waiting for
            the background Arrow (de)serialization thread.

    Returns:
        A zero-argument join callable: it blocks until completion,
        re-raises any error captured on the worker thread, joins the
        underlying query, then returns the received DataFrame
        (receive mode) or None (send mode).
    """
    batch = None
    error = None
    # prepare: an in-memory echo pipe couples this process to the query's
    # byte-stream generators; direction depends on the mode chosen below
    read_stream, write_stream = iteration.echo_io()
    if dataframe is None:
        # receive mode: the query writes Arrow bytes into write_stream
        gen_in = iteration.empty_in()
        gen_out = iteration.stream_out(write_stream)
    else:
        # send mode: the query reads Arrow bytes from read_stream
        gen_in = iteration.stream_in(read_stream)
        gen_out = iteration.empty_out()
    raw_join = self._run(
        f'{query} format ArrowStream', gen_in, gen_out, method, settings)

    # create thread: (de)serialization must run concurrently with the
    # query so neither end of the pipe blocks the other
    def handle_batch() -> None:
        # Worker: converts between Arrow stream bytes and the DataFrame.
        nonlocal dataframe
        nonlocal batch
        nonlocal error
        try:
            if dataframe is None:
                batch = pyarrow.RecordBatchStreamReader(read_stream)
                dataframe = batch.read_pandas()
                if encoding is not None:

                    def decode(value: typing.Any) -> typing.Any:
                        # Recursively turn bytes-like leaves into str,
                        # preserving the container type at every level.
                        # Exact type checks (not isinstance) are used so
                        # subclasses pass through untouched.
                        if type(value) is bytes:
                            assert encoding is not None
                            return value.decode(encoding)
                        if type(value) is bytearray:
                            assert encoding is not None
                            return value.decode(encoding)
                        if type(value) is tuple:
                            return tuple(decode(child) for child in value)
                        if type(value) is list:
                            return [decode(child) for child in value]
                        if type(value) is numpy.ndarray:
                            return numpy.array(
                                [decode(child) for child in value])
                        if type(value) is set:
                            return {decode(child) for child in value}
                        if type(value) is frozenset:
                            return frozenset(
                                decode(child) for child in value)
                        if type(value) is dict:
                            return {
                                key: decode(child)
                                for key, child in value.items()
                            }
                        return value

                    # Only object ('O') columns can hold bytes/containers;
                    # other dtypes are passed through unchanged.
                    dataframe = pandas.DataFrame({
                        column: (
                            dataframe[column].apply(decode)
                            if dataframe[column].dtype == 'O'
                            else dataframe[column])
                        for column in dataframe
                    })
            else:
                if encoding is not None:

                    def encode(value: typing.Any) -> typing.Any:
                        # Mirror of decode: recursively turn str leaves
                        # into bytes before Arrow serialization.
                        if type(value) is str:
                            assert encoding is not None
                            return value.encode(encoding)
                        if type(value) is tuple:
                            return tuple(encode(child) for child in value)
                        if type(value) is list:
                            return [encode(child) for child in value]
                        if type(value) is numpy.ndarray:
                            return numpy.array(
                                [encode(child) for child in value])
                        if type(value) is set:
                            return {encode(child) for child in value}
                        if type(value) is frozenset:
                            return frozenset(
                                encode(child) for child in value)
                        if type(value) is dict:
                            return {
                                key: encode(child)
                                for key, child in value.items()
                            }
                        return value

                    dataframe = pandas.DataFrame({
                        column: (
                            dataframe[column].apply(encode)
                            if dataframe[column].dtype == 'O'
                            else dataframe[column])
                        for column in dataframe
                    })
                # build the table column-by-column (rather than
                # Table.from_pandas) so the encoded object columns are
                # typed from their actual values
                table = pyarrow.Table.from_arrays([
                    pyarrow.array(dataframe[column].values)
                    for column in dataframe
                ], dataframe.columns)
                batch = pyarrow.RecordBatchStreamWriter(
                    write_stream, table.schema)
                batch.write_table(table)
                dataframe = None
            batch.close()
            # closing the write end signals end-of-stream to the consumer
            write_stream.close()
        except pyarrow.ArrowInvalid:
            # NOTE(review): swallowed deliberately — presumably an empty
            # or truncated Arrow stream (query returned no data); confirm.
            pass
        except BaseException as raw_error:  # pylint: disable=broad-except
            # captured here, re-raised on the caller's thread in join()
            error = raw_error
    thread = threading.Thread(target=handle_batch)
    thread.start()

    # join thread
    def join() -> typing.Optional[pandas.DataFrame]:
        # Poll with a timeout so a worker failure is noticed promptly.
        while error is None and thread.is_alive():
            thread.join(join_interval)
        if error is not None:
            raise error  # pylint: disable=raising-bad-type
        # NOTE(review): raw_join() is skipped on the error path —
        # presumably self._run cleans up on failure; confirm.
        raw_join()
        return dataframe
    return join