Ejemplo n.º 1
0
def test_iteration_stream_out() -> None:
    """Check that ``iteration.stream_out`` writes sent chunks to its stream.

    A binary file is opened for writing and handed to
    ``iteration.stream_out``; one payload chunk and one empty chunk are sent
    to the resulting generator, then the file is read back and compared with
    the payload.
    """
    # The write handle is handed over to stream_out and not closed here.
    # NOTE(review): presumably the generator flushes/closes it once the empty
    # chunk arrives -- the assertion below relies on the data being on disk.
    gen_out = iteration.stream_out(open('/tmp/pyck_test_iteration_3', 'wb'))
    next(gen_out)  # advance the generator to its first yield before send()
    gen_out.send(b'world\n')
    gen_out.send(b'')

    # Read the result back through a context manager so the verification
    # handle is closed deterministically instead of being leaked.
    with open('/tmp/pyck_test_iteration_3', 'rb') as result_file:
        assert result_file.read() == b'world\n'
Ejemplo n.º 2
0
def test_iteration_io_out() -> None:
    """Check the output generators ``stream_out`` and ``file_out``.

    For each generator, one payload chunk and one empty chunk are sent, then
    the target file is read back and compared with the payload.
    """
    # stream_out: the write handle is handed over to the generator.
    # NOTE(review): presumably it is flushed/closed once the empty chunk
    # arrives -- the assertion below relies on the data being on disk.
    gen_out = iteration.stream_out(open('/tmp/pyck_test_iteration_3', 'wb'))
    next(gen_out)  # advance the generator to its first yield before send()
    gen_out.send(b'world\n')
    gen_out.send(b'')
    # Context manager closes the verification handle deterministically.
    with open('/tmp/pyck_test_iteration_3', 'rb') as result_file:
        assert result_file.read() == b'world\n'

    # TODO: test pipe_out

    # file_out: takes the target path instead of an already opened stream.
    gen_out = iteration.file_out('/tmp/pyck_test_iteration_4')
    next(gen_out)
    gen_out.send(b'world\n')
    gen_out.send(b'')
    with open('/tmp/pyck_test_iteration_4', 'rb') as result_file:
        assert result_file.read() == b'world\n'
Ejemplo n.º 3
0
    def query_stream_async(
        self,
        query: str,
        stream_in: typing.Optional[typing.BinaryIO] = None,
        stream_out: typing.Optional[typing.BinaryIO] = None,
        method: typing.Optional[typing_extensions.Literal['tcp', 'http',
                                                          'ssh']] = None,
        settings: typing.Optional[typing.Dict[str, str]] = None
    ) -> typing.Callable[[], None]:
        """Start ``query`` using binary streams for its input and output.

        Args:
            query: Query text, passed unchanged to ``self._run``.
            stream_in: Binary stream wrapped with ``iteration.stream_in``;
                when ``None``, ``iteration.empty_in()`` is used instead.
            stream_out: Binary stream wrapped with ``iteration.stream_out``;
                when ``None``, ``iteration.empty_out()`` is used instead.
            method: Forwarded unchanged to ``self._run``.
            settings: Forwarded unchanged to ``self._run``.

        Returns:
            Whatever ``self._run`` returns (annotated as a no-argument
            callable returning ``None``).
        """
        # Build the input generator first, then the output generator.
        gen_in = (iteration.stream_in(stream_in)
                  if stream_in is not None else iteration.empty_in())
        gen_out = (iteration.stream_out(stream_out)
                   if stream_out is not None else iteration.empty_out())

        return self._run(query, gen_in, gen_out, method, settings)
Ejemplo n.º 4
0
def test_session_pandas() -> None:
    """Round-trip a one-million-row DataFrame through a local session as CSV.

    The frame is written as ``CSVWithNames`` into an ``insert`` query, read
    back from a ``select`` query in the same format, and the ``x`` column of
    both frames is compared.
    """
    local_session = ck.LocalSession()

    local_session.query('drop table if exists pyck_test')
    local_session.query('create table pyck_test (x String) engine = Memory')

    dataframe_1 = pandas.DataFrame({'x': pandas.RangeIndex(1000000)})

    # Insert: the query consumes read_stream while pandas writes the CSV text
    # into the other end of the pair returned by echo_io().
    read_stream, write_stream = iteration.echo_io()
    join = local_session.query_async(
        'insert into pyck_test format CSVWithNames',
        gen_in=iteration.stream_in(read_stream))
    # Closing the text wrapper flushes it and closes write_stream. Do it
    # explicitly instead of relying on the temporary wrapper being
    # garbage-collected, which is interpreter-specific.
    with io.TextIOWrapper(write_stream) as text_out:
        dataframe_1.to_csv(text_out, index=False)
    join()

    # Select: the query writes into write_stream while pandas parses the CSV
    # text from the other end.
    read_stream, write_stream = iteration.echo_io()
    join = local_session.query_async(
        'select * from pyck_test format CSVWithNames',
        gen_out=iteration.stream_out(write_stream))
    with io.TextIOWrapper(read_stream) as text_in:
        dataframe_2 = pandas.read_csv(text_in)
    join()
    assert dataframe_2.x.to_list() == dataframe_1.x.to_list()

    local_session.query('drop table pyck_test')
Ejemplo n.º 5
0
    def query_pandas_async(
        self,
        query: str,
        dataframe: typing.Optional[pandas.DataFrame] = None,
        method: typing.Optional[typing_extensions.Literal['tcp', 'http',
                                                          'ssh']] = None,
        settings: typing.Optional[typing.Dict[str, str]] = None,
        join_interval: float = 0.1
    ) -> typing.Callable[[], typing.Optional[pandas.DataFrame]]:
        """Start ``query`` and exchange its data as a pandas DataFrame.

        `` format ArrowStream`` is appended to ``query`` before it is passed
        to ``self._run``. A worker thread is started immediately to move the
        data between pandas and the stream pair from ``iteration.echo_io()``.

        Args:
            query: Query text without a format clause.
            dataframe: When ``None``, the query output is read into a new
                DataFrame. Otherwise this frame is written as the query
                input.
            method: Forwarded unchanged to ``self._run``.
            settings: Forwarded unchanged to ``self._run``.
            join_interval: Timeout in seconds for each ``Thread.join`` call
                made while waiting for the worker thread.

        Returns:
            A no-argument callable. It waits for the worker thread, re-raises
            any exception the worker caught, calls the callable returned by
            ``self._run``, and returns the DataFrame that was read (``None``
            when a frame was written).
        """
        arrow_io = None
        failure = None

        # prepare

        pipe_reader, pipe_writer = iteration.echo_io()

        if dataframe is not None:
            # Writing: the query input comes from the read end.
            gen_in = iteration.stream_in(pipe_reader)
            gen_out = iteration.empty_out()
        else:
            # Reading: the query output goes into the write end.
            gen_in = iteration.empty_in()
            gen_out = iteration.stream_out(pipe_writer)

        wait_for_query = self._run(f'{query} format ArrowStream', gen_in,
                                   gen_out, method, settings)

        # create thread

        def handle_batch() -> None:
            nonlocal dataframe, arrow_io, failure

            try:
                if dataframe is not None:
                    table = pyarrow.Table.from_pandas(dataframe)
                    arrow_io = pyarrow.RecordBatchStreamWriter(
                        pipe_writer, table.schema)
                    arrow_io.write_table(table)
                    # Cleared so the join callable returns None after a write.
                    dataframe = None
                    arrow_io.close()
                    pipe_writer.close()
                else:
                    arrow_io = pyarrow.RecordBatchStreamReader(pipe_reader)
                    dataframe = arrow_io.read_pandas()

            except BaseException as caught:  # pylint: disable=broad-except
                # Stored for the caller; re-raised from the join callable.
                failure = caught

        worker = threading.Thread(target=handle_batch)
        worker.start()

        # join thread

        def join() -> typing.Optional[pandas.DataFrame]:
            # Poll in short slices so a recorded failure ends the wait early.
            while failure is None:
                if not worker.is_alive():
                    break

                worker.join(join_interval)

            if failure is not None:
                raise failure  # pylint: disable=raising-bad-type

            wait_for_query()

            return dataframe

        return join
Ejemplo n.º 6
0
    def query_pandas_async(
        self,
        query: str,
        dataframe: typing.Optional[pandas.DataFrame] = None,
        encoding: typing.Optional[str] = 'utf-8',
        method: typing.Optional[typing_extensions.Literal['tcp', 'http',
                                                          'ssh']] = None,
        settings: typing.Optional[typing.Dict[str, str]] = None,
        join_interval: float = 0.1
    ) -> typing.Callable[[], typing.Optional[pandas.DataFrame]]:
        """Start ``query`` and exchange its data as a pandas DataFrame.

        `` format ArrowStream`` is appended to ``query`` before it is passed
        to ``self._run``. A worker thread is started immediately to move the
        data between pandas and the stream pair from ``iteration.echo_io()``.

        Args:
            query: Query text without a format clause.
            dataframe: When ``None``, the query output is read into a new
                DataFrame. Otherwise this frame is written as the query
                input.
            encoding: When not ``None``, values in object-dtype columns are
                converted with this codec: ``bytes``/``bytearray`` values are
                decoded after reading and ``str`` values are encoded before
                writing. Values nested in tuples, lists, numpy arrays, sets,
                frozensets and dict values are converted too. Only exact
                types are matched, not subclasses.
            method: Forwarded unchanged to ``self._run``.
            settings: Forwarded unchanged to ``self._run``.
            join_interval: Timeout in seconds for each ``Thread.join`` call
                made while waiting for the worker thread.

        Returns:
            A no-argument callable. It waits for the worker thread, re-raises
            any exception the worker recorded, calls the callable returned by
            ``self._run``, and returns the current DataFrame (``None`` after
            a completed write).
        """
        batch = None
        error = None

        # value conversion helpers

        def make_converter(
            leaf_types: typing.Tuple[type, ...],
            convert_leaf: typing.Callable[[typing.Any], typing.Any]
        ) -> typing.Callable[[typing.Any], typing.Any]:
            # Build a recursive walker that applies convert_leaf to values
            # whose exact type is in leaf_types and rebuilds containers.
            def convert(value: typing.Any) -> typing.Any:
                kind = type(value)

                if any(kind is leaf_type for leaf_type in leaf_types):
                    return convert_leaf(value)

                if kind is tuple:
                    return tuple(convert(child) for child in value)

                if kind is list:
                    return [convert(child) for child in value]

                if kind is numpy.ndarray:
                    return numpy.array([convert(child) for child in value])

                if kind is set:
                    return {convert(child) for child in value}

                if kind is frozenset:
                    return frozenset(convert(child) for child in value)

                if kind is dict:
                    # Only the values are converted; keys are kept as-is.
                    return {
                        key: convert(child)
                        for key, child in value.items()
                    }

                return value

            return convert

        def decode_leaf(raw: typing.Any) -> typing.Any:
            # Only reached when encoding is set; the assert narrows the
            # Optional for type checkers.
            assert encoding is not None

            return raw.decode(encoding)

        def encode_leaf(text: typing.Any) -> typing.Any:
            assert encoding is not None

            return text.encode(encoding)

        decode = make_converter((bytes, bytearray), decode_leaf)
        encode = make_converter((str, ), encode_leaf)

        def convert_object_columns(
            frame: pandas.DataFrame,
            convert: typing.Callable[[typing.Any], typing.Any]
        ) -> pandas.DataFrame:
            # Rebuild the frame, converting only object-dtype columns.
            return pandas.DataFrame({
                column: (frame[column].apply(convert)
                         if frame[column].dtype == 'O' else frame[column])
                for column in frame
            })

        # prepare

        read_stream, write_stream = iteration.echo_io()

        if dataframe is None:
            # Reading: the query output goes into the write end.
            gen_in = iteration.empty_in()
            gen_out = iteration.stream_out(write_stream)
        else:
            # Writing: the query input comes from the read end.
            gen_in = iteration.stream_in(read_stream)
            gen_out = iteration.empty_out()

        raw_join = self._run(f'{query} format ArrowStream', gen_in, gen_out,
                             method, settings)

        # create thread

        def handle_batch() -> None:
            nonlocal dataframe
            nonlocal batch
            nonlocal error

            try:
                if dataframe is None:
                    batch = pyarrow.RecordBatchStreamReader(read_stream)
                    dataframe = batch.read_pandas()

                    if encoding is not None:
                        dataframe = convert_object_columns(dataframe, decode)
                else:
                    if encoding is not None:
                        dataframe = convert_object_columns(dataframe, encode)

                    table = pyarrow.Table.from_arrays([
                        pyarrow.array(dataframe[column].values)
                        for column in dataframe
                    ], dataframe.columns)
                    batch = pyarrow.RecordBatchStreamWriter(
                        write_stream, table.schema)
                    batch.write_table(table)
                    # Cleared so the join callable returns None after a write.
                    dataframe = None
                    batch.close()
                    write_stream.close()
            except pyarrow.ArrowInvalid:
                # NOTE(review): silently ignored on both paths. Presumably
                # meant for queries that produce no Arrow data on the read
                # path -- confirm that write-side failures should also be
                # dropped here.
                pass
            except BaseException as raw_error:  # pylint: disable=broad-except
                # Stored for the caller; re-raised from the join callable.
                error = raw_error

        thread = threading.Thread(target=handle_batch)

        thread.start()

        # join thread

        def join() -> typing.Optional[pandas.DataFrame]:
            # Poll in short slices so a recorded error ends the wait early.
            while error is None and thread.is_alive():
                thread.join(join_interval)

            if error is not None:
                raise error  # pylint: disable=raising-bad-type

            raw_join()

            return dataframe

        return join