Example #1
0
class ArrowCoderImpl(FieldCoderImpl):
    """
    A coder that (de)serializes columnar data using the Arrow streaming format.
    """

    def __init__(self, schema, row_type, timezone):
        self._schema = schema
        self._field_types = row_type.field_types()
        self._timezone = timezone
        self._resettable_io = ResettableIO()
        self._batch_reader = ArrowCoderImpl._load_from_stream(self._resettable_io)

    def encode_to_stream(self, cols, out_stream):
        # Point the shared resettable IO at the target stream, then write a
        # single record batch converted from the pandas columns.
        self._resettable_io.set_output_stream(out_stream)
        writer = pa.RecordBatchStreamWriter(self._resettable_io, self._schema)
        batch = pandas_to_arrow(self._schema, self._timezone, self._field_types, cols)
        writer.write_batch(batch)

    def decode_from_stream(self, in_stream, length=0):
        return self.decode_one_batch_from_stream(in_stream, length)

    @staticmethod
    def _load_from_stream(stream):
        # Re-open the IPC stream on each round: the underlying resettable IO
        # is rebound to fresh input bytes before every batch is pulled.
        while stream.readable():
            yield pa.ipc.open_stream(stream).read_next_batch()

    def decode_one_batch_from_stream(self, in_stream: InputStream, size: int) -> List:
        self._resettable_io.set_input_bytes(in_stream.read(size))
        # there is only one arrow batch in the underlying input stream
        batch = next(self._batch_reader)
        return arrow_to_pandas(self._timezone, self._field_types, [batch])

    def __repr__(self):
        return 'ArrowCoderImpl[%s]' % self._schema
Example #2
0
class ArrowCoderImpl(StreamCoderImpl):
    """
    A coder which (de)serializes batches of columnar data in the Arrow
    streaming format, length-prefixing each serialized batch on the wire.
    """

    def __init__(self, schema):
        self._schema = schema
        self._resettable_io = ResettableIO()
        # Lazy generator yielding one Arrow batch per rebind of the input bytes.
        self._batch_reader = ArrowCoderImpl._load_from_stream(
            self._resettable_io)
        self._batch_writer = pa.RecordBatchStreamWriter(
            self._resettable_io, self._schema)
        # Intermediate buffer: batches are written here first so their size
        # can be emitted as a var-int prefix before the payload.
        self.data_out_stream = create_OutputStream()
        self._resettable_io.set_output_stream(self.data_out_stream)

    def encode_to_stream(self, iter_cols, out_stream, nested):
        data_out_stream = self.data_out_stream
        for cols in iter_cols:
            self._batch_writer.write_batch(self._create_batch(cols))
            out_stream.write_var_int64(data_out_stream.size())
            out_stream.write(data_out_stream.get())
            data_out_stream._clear()

    def decode_from_stream(self, in_stream, nested):
        while in_stream.size() > 0:
            yield self._decode_one_batch_from_stream(in_stream)

    @staticmethod
    def _load_from_stream(stream):
        reader = pa.ipc.open_stream(stream)
        for batch in reader:
            yield batch

    def _create_batch(self, cols):
        """Converts the given pandas columns into a pyarrow.RecordBatch."""
        def create_array(s, t):
            try:
                return pa.Array.from_pandas(s, mask=s.isnull(), type=t)
            except pa.ArrowException as e:
                error_msg = "Exception thrown when converting pandas.Series (%s) to " \
                            "pyarrow.Array (%s)."
                # Chain the original Arrow error (`from e`) so the full cause is
                # preserved in the traceback instead of being stuffed into the
                # RuntimeError's args tuple.
                raise RuntimeError(error_msg % (s.dtype, t)) from e

        arrays = [
            create_array(cols[i], self._schema.types[i])
            for i in range(0, len(self._schema))
        ]
        return pa.RecordBatch.from_arrays(arrays, self._schema)

    def _decode_one_batch_from_stream(self,
                                      in_stream: create_InputStream) -> List:
        self._resettable_io.set_input_bytes(in_stream.read_all(True))
        # there is only one arrow batch in the underlying input stream
        table = pa.Table.from_batches([next(self._batch_reader)])
        return [c.to_pandas(date_as_object=True) for c in table.itercolumns()]

    def __repr__(self):
        return 'ArrowCoderImpl[%s]' % self._schema
Example #3
0
class ArrowCoderImpl(StreamCoderImpl):
    """
    A coder which (de)serializes length-prefixed Arrow record batches.
    """

    def __init__(self, schema, row_type, timezone):
        self._schema = schema
        self._field_types = row_type.field_types()
        self._timezone = timezone
        self._resettable_io = ResettableIO()
        self._batch_reader = ArrowCoderImpl._load_from_stream(
            self._resettable_io)
        self._batch_writer = pa.RecordBatchStreamWriter(
            self._resettable_io, self._schema)
        self.data_out_stream = create_OutputStream()
        self._resettable_io.set_output_stream(self.data_out_stream)

    def encode_to_stream(self, iter_cols, out_stream, nested):
        buffer = self.data_out_stream
        for cols in iter_cols:
            batch = pandas_to_arrow(
                self._schema, self._timezone, self._field_types, cols)
            self._batch_writer.write_batch(batch)
            # Prefix each serialized batch with its byte length.
            out_stream.write_var_int64(buffer.size())
            out_stream.write(buffer.get())
            buffer._clear()

    def decode_from_stream(self, in_stream, nested):
        while in_stream.size() > 0:
            batch_size = in_stream.read_var_int64()
            yield self._decode_one_batch_from_stream(in_stream, batch_size)

    @staticmethod
    def _load_from_stream(stream):
        yield from pa.ipc.open_stream(stream)

    def _decode_one_batch_from_stream(self, in_stream: create_InputStream,
                                      size: int) -> List:
        self._resettable_io.set_input_bytes(in_stream.read(size))
        # there is only one arrow batch in the underlying input stream
        batch = next(self._batch_reader)
        return arrow_to_pandas(self._timezone, self._field_types, [batch])

    def __repr__(self):
        return 'ArrowCoderImpl[%s]' % self._schema
Example #4
0
class ArrowCoderImpl(StreamCoderImpl):
    """
    A coder which (de)serializes length-prefixed Arrow record batches and
    converts timestamp columns between the local timezone and the internal
    (timezone-naive) representation.
    """

    def __init__(self, schema, row_type, timezone):
        self._schema = schema
        self._field_types = row_type.field_types()
        self._timezone = timezone
        self._resettable_io = ResettableIO()
        # Lazy generator yielding one Arrow batch per rebind of the input bytes.
        self._batch_reader = ArrowCoderImpl._load_from_stream(
            self._resettable_io)
        self._batch_writer = pa.RecordBatchStreamWriter(
            self._resettable_io, self._schema)
        # Intermediate buffer: batches are written here first so their size
        # can be emitted as a var-int prefix before the payload.
        self.data_out_stream = create_OutputStream()
        self._resettable_io.set_output_stream(self.data_out_stream)

    def encode_to_stream(self, iter_cols, out_stream, nested):
        data_out_stream = self.data_out_stream
        for cols in iter_cols:
            self._batch_writer.write_batch(self._create_batch(cols))
            out_stream.write_var_int64(data_out_stream.size())
            out_stream.write(data_out_stream.get())
            data_out_stream._clear()

    def decode_from_stream(self, in_stream, nested):
        while in_stream.size() > 0:
            yield self._decode_one_batch_from_stream(in_stream)

    @staticmethod
    def _load_from_stream(stream):
        reader = pa.ipc.open_stream(stream)
        for batch in reader:
            yield batch

    def _create_batch(self, cols):
        """Converts the given pandas columns into a pyarrow.RecordBatch,
        normalizing timestamp columns to the internal representation first."""
        def create_array(s, t):
            try:
                return pa.Array.from_pandas(s, mask=s.isnull(), type=t)
            except pa.ArrowException as e:
                error_msg = "Exception thrown when converting pandas.Series (%s) to " \
                            "pyarrow.Array (%s)."
                # Chain the original Arrow error (`from e`) so the full cause is
                # preserved in the traceback instead of being stuffed into the
                # RuntimeError's args tuple.
                raise RuntimeError(error_msg % (s.dtype, t)) from e

        arrays = [
            create_array(
                ArrowCoderImpl.tz_convert_to_internal(cols[i],
                                                      self._field_types[i],
                                                      self._timezone),
                self._schema.types[i]) for i in range(0, len(self._schema))
        ]
        return pa.RecordBatch.from_arrays(arrays, self._schema)

    def _decode_one_batch_from_stream(self,
                                      in_stream: create_InputStream) -> List:
        self._resettable_io.set_input_bytes(in_stream.read_all(True))
        # there is only one arrow batch in the underlying input stream
        table = pa.Table.from_batches([next(self._batch_reader)])
        return [
            ArrowCoderImpl.tz_convert_from_internal(
                c.to_pandas(date_as_object=True), t, self._timezone)
            for c, t in zip(table.itercolumns(), self._field_types)
        ]

    @staticmethod
    def tz_convert_from_internal(s: pd.Series, t: DataType,
                                 local_tz) -> pd.Series:
        """
        Converts the timestamp series from internal according to the specified local timezone.

        Returns the same series if the series is not a timestamp series. Otherwise,
        returns a converted series.
        """
        # NOTE: exact type check (not isinstance) is deliberate so subclasses of
        # LocalZonedTimestampType are not converted.
        if type(t) == LocalZonedTimestampType:
            return s.dt.tz_localize(local_tz)
        else:
            return s

    @staticmethod
    def tz_convert_to_internal(s: pd.Series, t: DataType,
                               local_tz) -> pd.Series:
        """
        Converts the timestamp series to internal according to the specified local timezone.
        """
        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
        if type(t) == LocalZonedTimestampType:
            if is_datetime64_dtype(s.dtype):
                # Naive timestamps are already in local wall-clock time.
                return s.dt.tz_localize(None)
            elif is_datetime64tz_dtype(s.dtype):
                # Aware timestamps: convert to the local timezone, then drop tz info.
                return s.dt.tz_convert(local_tz).dt.tz_localize(None)
        return s