Example 1
def test_fill_null():
    arr = pa.array([1, 2, None, 4], type=pa.int8())
    fill_value = pa.array([5], type=pa.int8())
    with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"):
        arr.fill_null(fill_value)

    arr = pa.array([None, None, None, None], type=pa.null())
    fill_value = pa.scalar(None, type=pa.null())
    result = arr.fill_null(fill_value)
    expected = pa.array([None, None, None, None])
    assert result.equals(expected)

    arr = pa.array(['a', 'bb', None])
    result = arr.fill_null('ccc')
    expected = pa.array(['a', 'bb', 'ccc'])
    assert result.equals(expected)

    arr = pa.array([b'a', b'bb', None], type=pa.large_binary())
    result = arr.fill_null('ccc')
    expected = pa.array([b'a', b'bb', b'ccc'], type=pa.large_binary())
    assert result.equals(expected)

    arr = pa.array(['a', 'bb', None])
    result = arr.fill_null(None)
    expected = pa.array(['a', 'bb', None])
    assert result.equals(expected)
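As a small companion sketch (not part of the original test), fill_null also accepts an explicit pa.scalar fill value for large_binary data, mirroring the null-typed case above:

    # Hedged sketch: fill a large_binary array with an explicit large_binary scalar.
    arr = pa.array([b'a', None, b'c'], type=pa.large_binary())
    result = arr.fill_null(pa.scalar(b'missing', type=pa.large_binary()))
    assert result.to_pylist() == [b'a', b'missing', b'c']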
Example 2
  def _ValidateRecordBatch(self,
                           record_batch,
                           raw_record_column_name=None):
    self.assertIsInstance(record_batch, pa.RecordBatch)
    self.assertEqual(record_batch.num_rows, 2)
    expected_schema = _EXPECTED_ARROW_SCHEMA
    if raw_record_column_name is not None:
      expected_schema = pa.schema(
          list(expected_schema) +
          [pa.field(raw_record_column_name, pa.large_list(pa.large_binary()))])
    self.assertTrue(
        record_batch.schema.equals(expected_schema),
        "Expected: {} ; got {}".format(expected_schema,
                                       record_batch.schema))
    for i, field in enumerate(record_batch.schema):
      if field.name == raw_record_column_name:
        continue
      self.assertTrue(
          record_batch.column(i).equals(_EXPECTED_COLUMN_VALUES[field.name]),
          "Column {} did not match ({} vs {}).".format(
              field.name, record_batch.column(i),
              _EXPECTED_COLUMN_VALUES[field.name]))

    if raw_record_column_name is not None:
      self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
      self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                       _RAW_RECORDS)
Example 3
  def testConvertToRecordBatchPassthroughData(self):
    passthrough_key1 = '__passthrough_with_batch_length__'
    passthrough_key2 = '__passthrough_with_one_value__'
    passthrough_key3 = '__passthrough_with_one_distinct_value_none__'
    passthrough_key4 = '__passthrough_with_one_distinct_value_not_none__'
    batch_dict = {
        'a':
            np.array([100, 1, 10], np.int64),
        passthrough_key1:
            pa.array([[1], None, [0]], pa.large_list(pa.int64())),
        passthrough_key2:
            pa.array([None], pa.large_list(pa.float32())),
        passthrough_key3:
            pa.array([None, None], pa.large_list(pa.large_binary())),
        passthrough_key4:
            pa.array([[10], [10]], pa.large_list(pa.int64()))
    }
    schema = schema_utils.schema_from_feature_spec(
        {'a': tf.io.FixedLenFeature([], tf.int64)})
    converter = impl_helper.make_tensor_to_arrow_converter(schema)
    passthrough_keys = {
        passthrough_key1, passthrough_key2, passthrough_key3, passthrough_key4
    }
    arrow_schema = pa.schema([
        ('a', pa.large_list(pa.int64())),
        (passthrough_key1, batch_dict[passthrough_key1].type),
        (passthrough_key2, batch_dict[passthrough_key2].type),
        (passthrough_key3, batch_dict[passthrough_key3].type),
        (passthrough_key4, batch_dict[passthrough_key4].type)
    ])
    # Note that we only need `input_metadata.arrow_schema`.
    input_metadata = TensorAdapterConfig(arrow_schema, {})
    record_batch, unary_features = impl._convert_to_record_batch(
        batch_dict, schema, converter, passthrough_keys, input_metadata)
    expected_record_batch = {
        'a': [[100], [1], [10]],
        passthrough_key1: [[1], None, [0]]
    }
    self.assertDictEqual(expected_record_batch, record_batch.to_pydict())
    expected_unary_features = {
        passthrough_key2: [None],
        passthrough_key3: [None],
        passthrough_key4: [[10]]
    }
    unary_features = {k: v.to_pylist() for k, v in unary_features.items()}
    self.assertDictEqual(expected_unary_features, unary_features)

    # Test pass-through data when the input and output batch sizes differ and
    # the pass-through column has more than one distinct value.
    passthrough_key5 = '__passthrough_with_wrong_batch_size__'
    passthrough_keys.add(passthrough_key5)
    batch_dict[passthrough_key5] = pa.array([[1], [2]],
                                            pa.large_list(pa.int64()))
    input_metadata.arrow_schema = input_metadata.arrow_schema.append(
        pa.field(passthrough_key5, batch_dict[passthrough_key5].type))
    with self.assertRaisesRegex(
        ValueError, 'Cannot pass-through data when '
        'input and output batch sizes are different'):
      _ = impl._convert_to_record_batch(batch_dict, schema, converter,
                                        passthrough_keys, input_metadata)
Example 4
def _make_record_batch(num_cols, num_rows):
    columns = [
        pa.array([[b"kk"]] * num_rows, type=pa.large_list(pa.large_binary()))
        for _ in range(num_cols)
    ]
    column_names = ["col%d" % c for c in range(num_cols)]
    return pa.record_batch(columns, column_names)
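A possible usage of the helper above (illustrative only; the sizes are arbitrary):

# Builds a 2-column, 3-row batch where every cell is a single-element
# large_list(large_binary) value.
batch = _make_record_batch(num_cols=2, num_rows=3)
assert batch.num_columns == 2 and batch.num_rows == 3
assert batch.schema.field('col0').type == pa.large_list(pa.large_binary())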
Example 5
    def _ValidateRecordBatch(self, record_batch, raw_record_column_name=None):
        self.assertIsInstance(record_batch, pa.RecordBatch)
        self.assertEqual(record_batch.num_rows, 3)
        for i, field in enumerate(record_batch.schema):
            if field.name == raw_record_column_name:
                continue
            if field.name == _SEQUENCE_COLUMN_NAME:
                self.assertTrue(pa.types.is_struct(field.type))
                for seq_column, seq_field in zip(
                        record_batch.column(i).flatten(), list(field.type)):
                    expected_array = _EXPECTED_COLUMN_VALUES[path.ColumnPath(
                        [_SEQUENCE_COLUMN_NAME, seq_field.name])]
                    self.assertTrue(
                        seq_column.equals(expected_array),
                        "Sequence column {} did not match ({} vs {})".format(
                            seq_field.name, seq_column, expected_array))
                continue
            self.assertTrue(
                record_batch.column(i).equals(
                    _EXPECTED_COLUMN_VALUES[path.ColumnPath([field.name])]),
                "Column {} did not match ({} vs {}).".format(
                    field.name, record_batch.column(i),
                    _EXPECTED_COLUMN_VALUES[path.ColumnPath([field.name])]))

        if raw_record_column_name is not None:
            self.assertEqual(record_batch.schema.names[-1],
                             raw_record_column_name)
            self.assertTrue(record_batch.columns[-1].type.equals(
                pa.large_list(pa.large_binary())))
            self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                             _SERIALIZED_EXAMPLES)
Example 6
def _LargeBinaryCanBeDictEncoded() -> bool:
  """Returns True if a large binary array can be dictionary encoded."""
  try:
    pa.array([], type=pa.large_binary()).dictionary_encode()
  except:  # pylint:disable=bare-except
    return False
  return True
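A plausible consumer of this probe might look like the following (illustrative only; _EncodeMaybe is a hypothetical helper, not part of the source):

def _EncodeMaybe(array: pa.Array) -> pa.Array:
  # Fall back to the plain array when the installed pyarrow cannot
  # dictionary-encode large_binary arrays.
  if pa.types.is_large_binary(array.type) and not _LargeBinaryCanBeDictEncoded():
    return array
  return array.dictionary_encode()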
Example 7
def _get_binary_like_byte_size_test_cases():
  result = []
  for array_type, sizeof_offsets in [
      (pa.binary(), 4),
      (pa.string(), 4),
      (pa.large_binary(), 8),
      (pa.large_string(), 8),
  ]:
    result.append(
        dict(
            testcase_name=str(array_type),
            array=pa.array([
                "a", "bb", "ccc", "dddd", "eeeee", "ffffff", "ggggggg",
                "hhhhhhhh", "iiiiiiiii"
            ], type=array_type),
            slice_offset=1,
            slice_length=3,
            # contents: 45
            # offsets: 10 * sizeof_offsets
            # null bitmap: 2
            expected_size=(45 + sizeof_offsets * 10 + 2),
            # contents: 9
            # offsets: 4 * sizeof_offsets
            # null bitmap: 1
            expected_sliced_size=(9 + sizeof_offsets * 4 + 1)))
  return result
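To make the expectations concrete: for pa.large_binary() the offsets are 8 bytes each, so the unsliced case works out to 45 + 8 * 10 + 2 = 127 bytes and the sliced case to 9 + 8 * 4 + 1 = 42 bytes; the 4-byte-offset types come to 87 and 26 bytes respectively.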
Example 8
def get_many_types():
    # Returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes.
    return (pa.null(), pa.bool_(), pa.int32(), pa.time32('s'), pa.time64('us'),
            pa.date32(), pa.timestamp('us'), pa.timestamp('us', tz='UTC'),
            pa.timestamp('us', tz='Europe/Paris'), pa.float16(), pa.float32(),
            pa.float64(), pa.decimal128(19, 4), pa.string(), pa.binary(),
            pa.binary(10), pa.large_string(), pa.large_binary(),
            pa.list_(pa.int32()), pa.large_list(pa.uint16()),
            pa.struct([
                pa.field('a', pa.int32()),
                pa.field('b', pa.int8()),
                pa.field('c', pa.string())
            ]),
            pa.struct([
                pa.field('a', pa.int32(), nullable=False),
                pa.field('b', pa.int8(), nullable=False),
                pa.field('c', pa.string())
            ]),
            pa.union(
                [pa.field('a', pa.binary(10)),
                 pa.field('b', pa.string())],
                mode=pa.lib.UnionMode_DENSE),
            pa.union(
                [pa.field('a', pa.binary(10)),
                 pa.field('b', pa.string())],
                mode=pa.lib.UnionMode_SPARSE),
            pa.union([
                pa.field('a', pa.binary(10), nullable=False),
                pa.field('b', pa.string())
            ],
                     mode=pa.lib.UnionMode_SPARSE),
            pa.dictionary(pa.int32(), pa.string()))
Example 9
def test_large_binary():
    data = [b'foo', b'bar'] * 50
    for type in [pa.large_binary(), pa.large_string()]:
        arr = pa.array(data, type=type)
        table = pa.Table.from_arrays([arr], names=['strs'])
        for use_dictionary in [False, True]:
            _check_roundtrip(table, use_dictionary=use_dictionary)
Example 10
    def test_decode_large_types(self, schema_text_proto, examples_text_proto,
                                create_expected):
        serialized_examples = [
            text_format.Parse(pbtxt, tf.train.Example()).SerializeToString()
            for pbtxt in examples_text_proto
        ]
        serialized_schema = None
        if schema_text_proto is not None:
            serialized_schema = text_format.Parse(
                schema_text_proto, schema_pb2.Schema()).SerializeToString()

        if serialized_schema:
            coder = example_coder.ExamplesToRecordBatchDecoder(
                serialized_schema=serialized_schema, use_large_types=True)
        else:
            coder = example_coder.ExamplesToRecordBatchDecoder(
                use_large_types=True)

        result = coder.DecodeBatch(serialized_examples)
        self.assertIsInstance(result, pa.RecordBatch)
        expected = create_expected(pa.large_list, pa.large_binary())
        self.assertTrue(result.equals(expected),
                        "actual: {}\n expected:{}".format(result, expected))
        if serialized_schema:
            self.assertTrue(expected.schema.equals(coder.ArrowSchema()))
Example 11
def _LargeBinaryCanBeValueCounted() -> bool:
  """Returns True if a large binary array can be value counted."""
  try:
    array_util.ValueCounts(pa.array([], type=pa.large_binary()))
  except:  # pylint:disable=bare-except
    return False
  return True
Example 12
def test_type_ids():
    # Having this fixed is very important because internally we rely on this id
    # to parse from python
    for idx, arrow_type in [
        (0, pa.null()),
        (1, pa.bool_()),
        (2, pa.uint8()),
        (3, pa.int8()),
        (4, pa.uint16()),
        (5, pa.int16()),
        (6, pa.uint32()),
        (7, pa.int32()),
        (8, pa.uint64()),
        (9, pa.int64()),
        (10, pa.float16()),
        (11, pa.float32()),
        (12, pa.float64()),
        (13, pa.string()),
        (13, pa.utf8()),
        (14, pa.binary()),
        (16, pa.date32()),
        (17, pa.date64()),
        (18, pa.timestamp("us")),
        (19, pa.time32("s")),
        (20, pa.time64("us")),
        (23, pa.decimal128(8, 1)),
        (34, pa.large_utf8()),
        (35, pa.large_binary()),
    ]:
        assert idx == arrow_type.id
Example 13
    def testIsBinaryLike(self):
        for t in (pa.binary(), pa.large_binary(), pa.string(),
                  pa.large_string()):
            self.assertTrue(arrow_util.is_binary_like(t))

        for t in (pa.list_(pa.binary()), pa.large_list(pa.string())):
            self.assertFalse(arrow_util.is_binary_like(t))
Example 14
    def _test_decode(self, schema_text_proto, sequence_examples_text_proto,
                     create_expected, use_large_types):
        serialized_sequence_examples = [
            text_format.Parse(pbtxt,
                              tf.train.SequenceExample()).SerializeToString()
            for pbtxt in sequence_examples_text_proto
        ]
        serialized_schema = None
        if schema_text_proto is not None:
            serialized_schema = text_format.Parse(
                schema_text_proto, schema_pb2.Schema()).SerializeToString()

        if serialized_schema:
            coder = sequence_example_coder.SequenceExamplesToRecordBatchDecoder(
                _TEST_SEQUENCE_COLUMN_NAME,
                serialized_schema,
                use_large_types=use_large_types)
        else:
            coder = sequence_example_coder.SequenceExamplesToRecordBatchDecoder(
                _TEST_SEQUENCE_COLUMN_NAME, use_large_types=use_large_types)

        result = coder.DecodeBatch(serialized_sequence_examples)
        self.assertIsInstance(result, pa.RecordBatch)
        if use_large_types:
            expected = create_expected(pa.large_list, pa.large_binary())
        else:
            expected = create_expected(pa.list_, pa.binary())
        self.assertTrue(result.equals(expected),
                        "actual: {}\n expected:{}".format(result, expected))

        if serialized_schema is not None:
            self.assertTrue(coder.ArrowSchema().equals(result.schema))
Example 15
    def _ValidateRecordBatch(self,
                             tfxio,
                             record_batch,
                             raw_record_column_name=None):
        self.assertIsInstance(record_batch, pa.RecordBatch)
        self.assertEqual(record_batch.num_rows, 3)
        expected_column_values = GetExpectedColumnValues(tfxio)
        for i, field in enumerate(record_batch.schema):
            if field.name == raw_record_column_name:
                continue
            self.assertTrue(
                record_batch.column(i).equals(
                    expected_column_values[field.name]),
                "Column {} did not match ({} vs {}).".format(
                    field.name, record_batch.column(i),
                    expected_column_values[field.name]))

        if raw_record_column_name is not None:
            if tfxio._can_produce_large_types:
                raw_record_column_type = pa.large_list(pa.large_binary())
            else:
                raw_record_column_type = pa.list_(pa.binary())
            self.assertEqual(record_batch.schema.names[-1],
                             raw_record_column_name)
            self.assertTrue(
                record_batch.columns[-1].type.equals(raw_record_column_type))
            self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                             _SERIALIZED_EXAMPLES)
Example 16
 def test_large_binary(self):
     array = pyarrow.array(
         [b"1111", b"2222", b"3333"],
         pyarrow.large_binary(),
         numpy.array([False, True, False]),
     )
     self._test_data(array)
Example 17
    def testRecordBatchAndTensorAdapter(self):
        column_name = "raw_record"
        telemetry_descriptors = ["some", "component"]
        tfxio = raw_tf_record.RawTfRecordTFXIO(
            self._raw_record_file,
            column_name,
            telemetry_descriptors=telemetry_descriptors)
        expected_type = (pa.large_list(pa.large_binary()) if
                         _ProducesLargeTypes(tfxio) else pa.list_(pa.binary()))

        got_schema = tfxio.ArrowSchema()
        self.assertTrue(
            got_schema.equals(pa.schema([pa.field(column_name,
                                                  expected_type)])),
            "got: {}".format(got_schema))

        def _AssertFn(record_batches):
            self.assertLen(record_batches, 1)
            record_batch = record_batches[0]
            self.assertTrue(record_batch.schema.equals(tfxio.ArrowSchema()))
            self.assertTrue(record_batch.columns[0].equals(
                pa.array([[r] for r in _RAW_RECORDS], type=expected_type)))
            tensor_adapter = tfxio.TensorAdapter()
            tensors = tensor_adapter.ToBatchTensors(record_batch)
            self.assertLen(tensors, 1)
            self.assertIn(column_name, tensors)

        p = beam.Pipeline()
        record_batch_pcoll = p | tfxio.BeamSource(batch_size=len(_RAW_RECORDS))
        beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
        pipeline_result = p.run()
        pipeline_result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                            telemetry_descriptors, "bytes",
                                            "tfrecords_gzip")
Example 18
def _tf_dtype_to_arrow_type(dtype: tf.DType):
  """Maps a tf Dtype to an Arrow type."""
  if dtype == tf.string:
    return pa.large_binary()
  elif dtype == tf.bool:
    raise TypeError("Unable to handle bool tensors -- consider casting it to a "
                    "tf.uint8")
  return pa.from_numpy_dtype(dtype.as_numpy_dtype)
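A minimal sanity check of this mapping (illustrative; assumes tf and pa are importable alongside the function above):

assert _tf_dtype_to_arrow_type(tf.string) == pa.large_binary()
assert _tf_dtype_to_arrow_type(tf.int64) == pa.int64()
assert _tf_dtype_to_arrow_type(tf.float32) == pa.float32()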
Example 19
def _GetConvertToBinaryFn(
        array_type: pa.DataType) -> Optional[Callable[[pa.Array], pa.Array]]:
    """Returns a function that converts a StringArray to BinaryArray."""

    if pa.types.is_string(array_type):
        return lambda array: array.view(pa.binary())
    if pa.types.is_large_string(array_type):
        return lambda array: array.view(pa.large_binary())
    return None
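A sketch of applying the returned converter (illustrative, not from the source):

arr = pa.array(['x', 'yy'], type=pa.large_string())
convert = _GetConvertToBinaryFn(arr.type)
if convert is not None:
    # view() reinterprets the existing buffers, so the conversion is zero-copy.
    assert convert(arr).type == pa.large_binary()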
Example 20
def CreateRawRecordColumn(raw_records: List[bytes],
                          produce_large_types: bool) -> pa.Array:
    """Returns an Array that satisfies the requirement of a raw record column."""
    list_array_factory = (pa.LargeListArray.from_arrays
                          if produce_large_types else pa.ListArray.from_arrays)
    binary_type = pa.large_binary() if produce_large_types else pa.binary()
    return list_array_factory(
        np.arange(0, len(raw_records) + 1, dtype=np.int64),
        pa.array(raw_records, type=binary_type))
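An illustrative call (the records here are made up) showing the shape of the result:

column = CreateRawRecordColumn([b'rec1', b'rec2'], produce_large_types=True)
assert column.type == pa.large_list(pa.large_binary())
# Each row wraps exactly one raw record, so flattening recovers the input.
assert column.flatten().to_pylist() == [b'rec1', b'rec2']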
Example 21
def test_large_binary_huge():
    s = b'xy' * 997
    data = [s] * ((1 << 33) // len(s))
    for type in [pa.large_binary(), pa.large_string()]:
        arr = pa.array(data, type=type)
        table = pa.Table.from_arrays([arr], names=['strs'])
        for use_dictionary in [False, True]:
            _check_roundtrip(table, use_dictionary=use_dictionary)
        del arr, table
Example 22
 def test_query_with_all_supported_types(self):
     record_batch = pa.RecordBatch.from_arrays([
         pa.array([[1], [2]], type=pa.list_(pa.int32())),
         pa.array([[10], [20]], type=pa.list_(pa.int64())),
         pa.array([[1.1], [2.2]], type=pa.list_(pa.float32())),
         pa.array([[10.1], [20.2]], type=pa.list_(pa.float64())),
         pa.array([['a'], ['b']], type=pa.list_(pa.string())),
         pa.array([['a+'], ['b+']], type=pa.list_(pa.large_string())),
         pa.array([[b'a_bytes'], [b'b_bytes']], type=pa.list_(pa.binary())),
         pa.array([[b'a_bytes+'], [b'b_bytes+']],
                  type=pa.list_(pa.large_binary())),
     ], [
         'int32_list',
         'int64_list',
         'float32_list',
         'float64_list',
         'string_list',
         'large_string_list',
         'binary_list',
         'large_binary_list',
     ])
     sql = """
   SELECT
     ARRAY(
       SELECT
         STRUCT(int32_list, int64_list,
           float32_list, float64_list,
           string_list, large_string_list,
           binary_list, large_binary_list)
       FROM
         example.int32_list,
         example.int64_list,
         example.float32_list,
         example.float64_list,
         example.string_list,
         example.large_string_list,
         example.binary_list,
         example.large_binary_list
     ) as slice_key
   FROM Examples as example;"""
     query = sql_util.RecordBatchSQLSliceQuery(sql, record_batch.schema)
     slices = query.Execute(record_batch)
     self.assertEqual(slices, [[[('int32_list', '1'), ('int64_list', '10'),
                                 ('float32_list', '1.1'),
                                 ('float64_list', '10.1'),
                                 ('string_list', 'a'),
                                 ('large_string_list', 'a+'),
                                 ('binary_list', 'a_bytes'),
                                 ('large_binary_list', 'a_bytes+')]],
                               [[('int32_list', '2'), ('int64_list', '20'),
                                 ('float32_list', '2.2'),
                                 ('float64_list', '20.2'),
                                 ('string_list', 'b'),
                                 ('large_string_list', 'b+'),
                                 ('binary_list', 'b_bytes'),
                                 ('large_binary_list', 'b_bytes+')]]])
Example 23
    def test_simple(self, attach_raw_records):
        raw_record_column_name = "_raw_records" if attach_raw_records else None
        tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
            self._input_path,
            self._decoder_path,
            _TELEMETRY_DESCRIPTORS,
            raw_record_column_name=raw_record_column_name)
        expected_fields = [
            pa.field("st1", pa.list_(pa.binary())),
            pa.field("st2", pa.list_(pa.binary())),
        ]
        if attach_raw_records:
            raw_record_column_type = (pa.large_list(pa.large_binary())
                                      if tfxio._can_produce_large_types else
                                      pa.list_(pa.binary()))
            expected_fields.append(
                pa.field(raw_record_column_name, raw_record_column_type))
        self.assertTrue(tfxio.ArrowSchema().equals(pa.schema(expected_fields)),
                        tfxio.ArrowSchema())
        self.assertEqual(
            tfxio.TensorRepresentations(), {
                "st1":
                text_format.Parse(
                    """varlen_sparse_tensor { column_name: "st1" }""",
                    schema_pb2.TensorRepresentation()),
                "st2":
                text_format.Parse(
                    """varlen_sparse_tensor { column_name: "st2" }""",
                    schema_pb2.TensorRepresentation())
            })

        tensor_adapter = tfxio.TensorAdapter()
        self.assertEqual(tensor_adapter.TypeSpecs(),
                         _DecoderForTesting().output_type_specs())

        def _assert_fn(list_of_rb):
            self.assertLen(list_of_rb, 1)
            rb = list_of_rb[0]
            self.assertTrue(rb.schema.equals(tfxio.ArrowSchema()))
            tensors = tensor_adapter.ToBatchTensors(rb)
            self.assertLen(tensors, 2)
            for tensor_name in ("st1", "st2"):
                self.assertIn(tensor_name, tensors)
                st = tensors[tensor_name]
                self.assertAllEqual(st.values, _RECORDS)
                self.assertAllEqual(st.indices, [[0, 0], [1, 0]])
                self.assertAllEqual(st.dense_shape, [2, 1])

        p = beam.Pipeline()
        rb_pcoll = p | tfxio.BeamSource(batch_size=len(_RECORDS))
        beam_testing_util.assert_that(rb_pcoll, _assert_fn)
        pipeline_result = p.run()
        pipeline_result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                            _TELEMETRY_DESCRIPTORS, "tensor",
                                            "tfrecords_gzip")
Example 24
def _tf_dtype_to_arrow_type(dtype: tf.DType) -> pa.DataType:
    """Maps a tf data type to a pyarrow data type."""
    if dtype == tf.string:
        return pa.large_binary()
    elif dtype == tf.int64:
        return pa.int64()
    elif dtype == tf.float32:
        return pa.float32()
    else:
        raise TypeError('Unable to handle data type {}'.format(dtype))
Example 25
def _resize_arrow_type(t):
    if t == pa.string():
        return pa.large_string()
    if t == pa.utf8():
        return pa.large_utf8()
    if t == pa.binary():
        return pa.large_binary()
    if isinstance(t, pa.lib.ListType):
        return pa.large_list(t.value_type)
    return t
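A sketch of applying the resizer across a whole schema (illustrative; the field names are arbitrary):

schema = pa.schema([('s', pa.string()), ('b', pa.binary()),
                    ('l', pa.list_(pa.int64()))])
resized = pa.schema(
    [pa.field(f.name, _resize_arrow_type(f.type)) for f in schema])
# string/binary widen to their large variants; list becomes large_list.
assert resized.field('b').type == pa.large_binary()
assert resized.field('l').type == pa.large_list(pa.int64())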
Example 26
def test_large_binary_overflow():
    s = b'x' * (1 << 31)
    arr = pa.array([s], type=pa.large_binary())
    table = pa.Table.from_arrays([arr], names=['strs'])
    for use_dictionary in [False, True]:
        writer = pa.BufferOutputStream()
        with pytest.raises(
                pa.ArrowInvalid,
                match="Parquet cannot store strings with size 2GB or more"):
            _write_table(table, writer, use_dictionary=use_dictionary)
Example 27
 def ArrowSchema(self) -> pa.Schema:
   schema = self._ArrowSchemaNoRawRecordColumn()
   if self._raw_record_column_name is not None:
     if schema.get_field_index(self._raw_record_column_name) != -1:
       raise ValueError(
           "Raw record column name {} collided with a column in the schema."
           .format(self._raw_record_column_name))
     schema = schema.append(
         pa.field(self._raw_record_column_name,
                  pa.large_list(pa.large_binary())))
   return schema
Example 28
def test_sequence_bytes():
    u1 = b'ma\xc3\xb1ana'
    data = [b'foo',
            u1.decode('utf-8'),  # unicode gets encoded,
            bytearray(b'bar'),
            None]
    for ty in [None, pa.binary(), pa.large_binary()]:
        arr = pa.array(data, type=ty)
        assert len(arr) == 4
        assert arr.null_count == 1
        assert arr.type == (ty or pa.binary())
        assert arr.to_pylist() == [b'foo', u1, b'bar', None]
Example 29
def _map_arrow_type(arrow_type):
    arrow_to_dh = {
        pa.null(): '',
        pa.bool_(): '',
        pa.int8(): 'byte',
        pa.int16(): 'short',
        pa.int32(): 'int',
        pa.int64(): 'long',
        pa.uint8(): '',
        pa.uint16(): 'char',
        pa.uint32(): '',
        pa.uint64(): '',
        pa.float16(): '',
        pa.float32(): 'float',
        pa.float64(): 'double',
        pa.time32('s'): '',
        pa.time32('ms'): '',
        pa.time64('us'): '',
        pa.time64('ns'): 'io.deephaven.time.DateTime',
        pa.timestamp('us', tz=None): '',
        pa.timestamp('ns', tz=None): '',
        pa.date32(): 'java.time.LocalDate',
        pa.date64(): 'java.time.LocalDate',
        pa.binary(): '',
        pa.string(): 'java.lang.String',
        pa.utf8(): 'java.lang.String',
        pa.large_binary(): '',
        pa.large_string(): '',
        pa.large_utf8(): '',
        # decimal128(int precision, int scale=0)
        # list_(value_type, int list_size=-1)
        # large_list(value_type)
        # map_(key_type, item_type[, keys_sorted])
        # struct(fields)
        # dictionary(index_type, value_type, …)
        # field(name, type, bool nullable = True[, metadata])
        # schema(fields[, metadata])
        # from_numpy_dtype(dtype)
    }

    dh_type = arrow_to_dh.get(arrow_type)
    if not dh_type:
        # if this is a case of timestamp with tz specified
        if isinstance(arrow_type, pa.TimestampType):
            dh_type = "io.deephaven.time.DateTime"

    if not dh_type:
        raise DHError(f'unsupported arrow data type : {arrow_type}')

    return {"deephaven:type": dh_type}
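For illustration (not part of the source), an int32 column resolves to a Deephaven type annotation, while large_binary maps to an empty entry in the table above and therefore raises DHError:

assert _map_arrow_type(pa.int32()) == {'deephaven:type': 'int'}
try:
    _map_arrow_type(pa.large_binary())
except DHError:
    pass  # expected: large_binary has no Deephaven mapping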
Example 30
def test__type_to_expression():
    assert "int" == _type_to_expression(pa.int32())
    assert "datetime" == _type_to_expression(TRIAD_DEFAULT_TIMESTAMP)
    assert "timestamp(ns,America/New_York)" == _type_to_expression(
        pa.timestamp("ns", "America/New_York"))
    assert "timestamp(s)" == _type_to_expression(pa.timestamp("s"))
    assert "decimal(5)" == _type_to_expression(pa.decimal128(5))
    assert "decimal(5,2)" == _type_to_expression(pa.decimal128(5, 2))
    assert "bytes" == _type_to_expression(pa.binary())
    assert "bytes" == _type_to_expression(pa.binary(-1))
    raises(NotImplementedError, lambda: _type_to_expression(pa.binary(0)))
    raises(NotImplementedError, lambda: _type_to_expression(pa.binary(-2)))
    raises(NotImplementedError, lambda: _type_to_expression(pa.binary(1)))
    raises(NotImplementedError, lambda: _type_to_expression(pa.large_binary()))