コード例 #1
0
ファイル: arrow_v0.py プロジェクト: vishalbelsare/cjworkbench
def _DEPRECATED_overwrite_to_fix_arrow_table_schema(
        path: Path, fallback_schema: pa.Schema) -> None:
    """Rewrite the Arrow file at `path` in place with a fixed-up schema.

    Every field of the stored table is passed through
    `__DEPRECATED_fix_field()`, together with the same-named field of
    `fallback_schema` (or `None` when `fallback_schema` has no such field).
    The table is then written out again under the resulting schema.

    A zero-byte file is left untouched.
    """
    if path.stat().st_size == 0:
        return

    table = load_trusted_arrow_file(path)
    stored_schema = table.schema

    fixed_fields = []
    for position, column_name in enumerate(stored_schema.names):
        # get_field_index() reports "no such field" as -1.
        fallback_position = fallback_schema.get_field_index(column_name)
        if fallback_position == -1:
            fallback_field = None
        else:
            fallback_field = fallback_schema.field(fallback_position)
        fixed_fields.append(
            __DEPRECATED_fix_field(
                stored_schema.field(position), fallback_field))
    fixed_schema = pa.schema(fixed_fields)

    # Always rewrite, never short-circuit on "schemas are equal": two
    # pa.Schema values with different number formats evaluate as equal.
    #
    # The new file is written to /var/tmp and then copied over `path`,
    # because our sandbox won't let us `rename(2)` in `path`'s directory.
    with tempfile_context(dir="/var/tmp") as rewrite_path:
        with pa.ipc.RecordBatchFileWriter(rewrite_path, fixed_schema) as writer:
            retyped_table = pa.table(table.columns, schema=fixed_schema)
            writer.write_table(retyped_table)
        # Copy only after the writer has been closed.
        shutil.copyfile(rewrite_path, path)
コード例 #2
0
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   """Precomputes column indices, dense shape and dtype for a sparse tensor.

   Args:
     arrow_schema: schema in which the index and value columns are looked up.
     tensor_representation: representation whose `sparse_tensor` field names
       the index columns, the value column and the dense shape.
   """
   super(_SparseTensorHandler, self).__init__(
       arrow_schema, tensor_representation)
   sparse_rep = tensor_representation.sparse_tensor

   # Positions of the index columns and of the value column in the schema.
   self._index_column_indices = tuple(
       map(arrow_schema.get_field_index, sparse_rep.index_column_names))
   value_column_index = arrow_schema.get_field_index(
       sparse_rep.value_column_name)
   self._value_column_index = value_column_index

   self._shape = [dim.size for dim in sparse_rep.dense_shape.dim]

   # NOTE(review): get_field_index() yields -1 for an unknown name, which
   # would select the last schema field here -- confirm the column is
   # validated before this point.
   _unused_depth, value_type = _GetNestDepthAndValueType(
       arrow_schema[value_column_index])
   self._dtype = _ArrowTypeToTfDtype(value_type)
   # One more than the number of dense-shape dimensions.
   self._coo_size = len(self._shape) + 1
コード例 #3
0
ファイル: tensor_adapter.py プロジェクト: tensorflow/tfx-bsl
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   """Precomputes column indices, dense shape and dtype for a sparse tensor.

   Args:
     arrow_schema: schema in which the index and value columns are looked up.
     tensor_representation: representation whose `sparse_tensor` field names
       the index columns, the value column and the dense shape.
   """
   super().__init__(arrow_schema, tensor_representation)
   sparse_rep = tensor_representation.sparse_tensor
   value_column_name = sparse_rep.value_column_name

   # Positions of the index columns and of the value column in the schema.
   self._index_column_indices = tuple(
       map(arrow_schema.get_field_index, sparse_rep.index_column_names))
   self._value_column_index = arrow_schema.get_field_index(value_column_name)

   self._shape = [dim.size for dim in sparse_rep.dense_shape.dim]

   # The value type is resolved by column path rather than by field index.
   _unused_depth, value_type = _GetNestDepthAndValueType(
       arrow_schema, path.ColumnPath(value_column_name))
   self._dtype = _ArrowTypeToTfDtype(value_type)
   # One more than the number of dense-shape dimensions.
   self._coo_size = len(self._shape) + 1
   self._convert_to_binary_fn = _GetConvertToBinaryFn(value_type)
コード例 #4
0
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   """Records the source column's index and the matching dtype.

   Args:
     arrow_schema: schema in which the column is looked up.
     tensor_representation: representation whose `varlen_sparse_tensor`
       field names the source column.
   """
   super(_VarLenSparseTensorHandler, self).__init__(
       arrow_schema, tensor_representation)
   varlen_rep = tensor_representation.varlen_sparse_tensor
   column_index = arrow_schema.get_field_index(varlen_rep.column_name)
   self._column_index = column_index
   # NOTE(review): get_field_index() yields -1 for an unknown name, which
   # would select the last schema field here -- confirm the column is
   # validated before this point.
   _unused_depth, value_type = _GetNestDepthAndValueType(
       arrow_schema[column_index])
   self._dtype = _ArrowTypeToTfDtype(value_type)
コード例 #5
0
ファイル: tensor_adapter.py プロジェクト: tensorflow/tfx-bsl
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   """Records the source column's index, dtype and binary converter.

   Args:
     arrow_schema: schema in which the column is looked up.
     tensor_representation: representation whose `varlen_sparse_tensor`
       field names the source column.
   """
   super().__init__(arrow_schema, tensor_representation)
   source_column = tensor_representation.varlen_sparse_tensor.column_name
   self._column_index = arrow_schema.get_field_index(source_column)
   # The value type is resolved by column path rather than by field index.
   source_path = path.ColumnPath(source_column)
   _unused_depth, value_type = _GetNestDepthAndValueType(
       arrow_schema, source_path)
   self._dtype = _ArrowTypeToTfDtype(value_type)
   self._convert_to_binary_fn = _GetConvertToBinaryFn(value_type)
コード例 #6
0
def _set_date_column_type_to_timestamp_ms(schema: pa.Schema) -> pa.Schema:
    """Return a new schema whose "DATE" column is typed `timestamp[ms]`.

    The result is rebuilt from the (name, type) pairs of `schema`; all
    columns other than "DATE" keep their original type.
    NOTE(review): rebuilding from (name, type) pairs presumably drops
    per-field nullability/metadata -- confirm that is acceptable.

    Args:
        schema: Schema containing exactly one field named "DATE".

    Returns:
        A new `pa.Schema` with the same column names and order.

    Raises:
        KeyError: If `schema` has no unique field named "DATE".
    """
    date_index = schema.get_field_index("DATE")
    if date_index < 0:
        # get_field_index() reports a missing (or duplicated) name as -1.
        # Using that as a list index below would silently retype the *last*
        # column instead of failing, so reject it explicitly.
        raise KeyError(
            'schema has no unique "DATE" field; fields are: '
            f"{schema.names!r}"
        )

    types = schema.types
    types[date_index] = pa.timestamp("ms")

    return pa.schema(zip(schema.names, types))
コード例 #7
0
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
     """Records the feature path, column index, ragged rank and dtypes.

     Args:
       arrow_schema: schema in which the first path step is looked up.
       tensor_representation: representation whose `ragged_tensor` field
         supplies the feature path and the row partition dtype.
     """
     super(_RaggedTensorHandler, self).__init__(arrow_schema,
                                                tensor_representation)
     ragged_rep = tensor_representation.ragged_tensor
     path_steps = list(ragged_rep.feature_path.step)
     self._steps = path_steps
     # Only the first step of the path is looked up in the schema.
     self._column_index = arrow_schema.get_field_index(path_steps[0])
     # The nest depth along the full path is stored as the ragged rank.
     nest_depth, value_type = _GetNestDepthAndValueType(
         arrow_schema, path_steps)
     self._ragged_rank = nest_depth
     self._dtype = _ArrowTypeToTfDtype(value_type)
     self._row_partition_dtype = ragged_rep.row_partition_dtype
     self._convert_to_binary_fn = _GetConvertToBinaryFn(value_type)
コード例 #8
0
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   """Records the column index, dtype and shape info for a dense tensor.

   Args:
     arrow_schema: schema in which the column is looked up.
     tensor_representation: representation whose `dense_tensor` field names
       the source column and gives the per-example shape.
   """
   super(_BaseDenseTensorHandler, self).__init__(arrow_schema,
                                                 tensor_representation)
   dense_rep = tensor_representation.dense_tensor
   column_index = arrow_schema.get_field_index(dense_rep.column_name)
   self._column_index = column_index
   # NOTE(review): get_field_index() yields -1 for an unknown name, which
   # would select the last schema field here -- confirm the column is
   # validated before this point.
   _unused_depth, value_type = _GetNestDepthAndValueType(
       arrow_schema[column_index])
   self._dtype = _ArrowTypeToTfDtype(value_type)

   per_example_shape = [dim.size for dim in dense_rep.shape.dim]
   # The stored shape gets a leading `None` entry in front of the
   # per-example dimensions.
   self._shape = [None, *per_example_shape]
   # `initial=1` makes the product of an empty shape equal to 1.
   self._unbatched_flat_len = int(np.prod(per_example_shape, initial=1))
コード例 #9
0
ファイル: tensor_adapter.py プロジェクト: tensorflow/tfx-bsl
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   """Records the column index, dtype and shape info for a dense tensor.

   Args:
     arrow_schema: schema in which the column is looked up.
     tensor_representation: representation whose `dense_tensor` field names
       the source column and gives the per-example shape.
   """
   super().__init__(arrow_schema, tensor_representation)
   dense_rep = tensor_representation.dense_tensor
   source_column = dense_rep.column_name
   self._column_index = arrow_schema.get_field_index(source_column)
   # The value type is resolved by column path rather than by field index.
   _unused_depth, value_type = _GetNestDepthAndValueType(
       arrow_schema, path.ColumnPath(source_column))
   self._dtype = _ArrowTypeToTfDtype(value_type)
   self._convert_to_binary_fn = _GetConvertToBinaryFn(value_type)

   per_example_shape = [dim.size for dim in dense_rep.shape.dim]
   # The stored shape gets a leading `None` entry in front of the
   # per-example dimensions.
   self._shape = [None, *per_example_shape]
   # `initial=1` makes the product of an empty shape equal to 1.
   self._unbatched_flat_len = int(np.prod(per_example_shape, initial=1))
コード例 #10
0
ファイル: tensor_adapter.py プロジェクト: tensorflow/tfx-bsl
  def __init__(self, arrow_schema: pa.Schema,
               tensor_representation: schema_pb2.TensorRepresentation):
    """Records the value path, partitions, fixed inner shape and dtypes.

    The representation's partitions are split into two groups: the run of
    partitions at the end of the list that all have `uniform_row_length`
    set, and everything before that run. Both groups keep their original
    relative order.

    Args:
      arrow_schema: schema in which the first path step is looked up.
      tensor_representation: representation whose `ragged_tensor` field
        supplies the feature path, the partitions and the row partition
        dtype.
    """
    super().__init__(arrow_schema, tensor_representation)
    ragged_rep = tensor_representation.ragged_tensor
    feature_path = ragged_rep.feature_path

    self._value_path = path.ColumnPath.from_proto(feature_path)
    # Only the first step of the path is looked up in the schema.
    self._column_index = arrow_schema.get_field_index(feature_path.step[0])
    self._outer_ragged_rank, value_type = _GetNestDepthAndValueType(
        arrow_schema, self._value_path)

    # Walk the partitions from last to first. While every partition seen so
    # far has `uniform_row_length` set, it belongs to the fixed-dimension
    # group; the first partition without it, and all partitions visited
    # after it, go to the ragged group.
    trailing_uniform = []
    remaining = []
    in_trailing_run = True
    for partition in reversed(ragged_rep.partition):
      in_trailing_run = (
          in_trailing_run and partition.HasField("uniform_row_length"))
      if in_trailing_run:
        trailing_uniform.append(partition)
      else:
        remaining.append(partition)
    # Both lists were collected back to front; restore the original order.
    remaining.reverse()
    trailing_uniform.reverse()
    self._ragged_partitions = remaining
    self._fixed_dimension_partitions = trailing_uniform

    # Row lengths of the fixed-dimension partitions, and their product.
    inner_fixed_shape = [
        partition.uniform_row_length for partition in trailing_uniform
    ]
    element_count = 1
    for row_length in inner_fixed_shape:
      element_count *= row_length
    self._inner_fixed_shape = inner_fixed_shape
    self._values_fixed_shape = [-1, *inner_fixed_shape]
    self._inferred_dimensions_elements = element_count

    self._dtype = _ArrowTypeToTfDtype(value_type)
    self._row_partition_dtype = ragged_rep.row_partition_dtype
    self._convert_to_binary_fn = _GetConvertToBinaryFn(value_type)