def _DEPRECATED_overwrite_to_fix_arrow_table_schema(
    path: Path, fallback_schema: pa.Schema
) -> None:
    """Rewrite the Arrow file at `path` with per-field fixes applied.

    Each field of the file's schema is passed through __DEPRECATED_fix_field
    together with the same-named field from `fallback_schema` (or None when
    `fallback_schema` has no such field). Empty files are left untouched.
    """
    if path.stat().st_size == 0:
        return

    table = load_trusted_arrow_file(path)
    source_schema = table.schema

    fixed_fields = []
    for position, field_name in enumerate(source_schema.names):
        fallback_index = fallback_schema.get_field_index(field_name)
        fallback_field = (
            None if fallback_index == -1 else fallback_schema.field(fallback_index)
        )
        fixed_fields.append(
            __DEPRECATED_fix_field(source_schema.field(position), fallback_field)
        )
    fixed_schema = pa.schema(fixed_fields)

    # Overwrite with new data
    #
    # We don't short-circuit by comparing schemas: two pa.Schema values
    # with different number formats evaluate as equal.
    #
    # We write a separate file to /var/tmp and then copy it: our sandbox
    # won't let us `rename(2)` in `path`'s directory.
    with tempfile_context(dir="/var/tmp") as rewrite_path:
        with pa.ipc.RecordBatchFileWriter(rewrite_path, fixed_schema) as writer:
            writer.write_table(pa.table(table.columns, schema=fixed_schema))
        shutil.copyfile(rewrite_path, path)
def __init__(self, arrow_schema: pa.Schema,
             tensor_representation: schema_pb2.TensorRepresentation):
    """Caches index/value column positions, dense shape, dtype and COO width
    for the sparse tensor described by `tensor_representation`."""
    super(_SparseTensorHandler, self).__init__(arrow_schema,
                                               tensor_representation)
    sparse_rep = tensor_representation.sparse_tensor
    self._index_column_indices = tuple(
        arrow_schema.get_field_index(name)
        for name in sparse_rep.index_column_names)
    self._value_column_index = arrow_schema.get_field_index(
        sparse_rep.value_column_name)
    self._shape = [d.size for d in sparse_rep.dense_shape.dim]
    _, value_type = _GetNestDepthAndValueType(
        arrow_schema[self._value_column_index])
    self._dtype = _ArrowTypeToTfDtype(value_type)
    # One COO coordinate per dense dimension, plus the batch dimension.
    self._coo_size = len(self._shape) + 1
def __init__(self, arrow_schema: pa.Schema,
             tensor_representation: schema_pb2.TensorRepresentation):
    """Resolves column indices, dense shape, dtype, COO width and the
    binary-conversion fn for the described sparse tensor."""
    super().__init__(arrow_schema, tensor_representation)
    sparse_rep = tensor_representation.sparse_tensor
    self._index_column_indices = tuple(
        arrow_schema.get_field_index(name)
        for name in sparse_rep.index_column_names)
    self._value_column_index = arrow_schema.get_field_index(
        sparse_rep.value_column_name)
    self._shape = [d.size for d in sparse_rep.dense_shape.dim]
    _, value_type = _GetNestDepthAndValueType(
        arrow_schema, path.ColumnPath(sparse_rep.value_column_name))
    self._dtype = _ArrowTypeToTfDtype(value_type)
    # One COO coordinate per dense dimension, plus the batch dimension.
    self._coo_size = len(self._shape) + 1
    self._convert_to_binary_fn = _GetConvertToBinaryFn(value_type)
def __init__(self, arrow_schema: pa.Schema,
             tensor_representation: schema_pb2.TensorRepresentation):
    """Resolves the value column and its TF dtype for a varlen sparse tensor."""
    super(_VarLenSparseTensorHandler, self).__init__(
        arrow_schema, tensor_representation)
    varlen_column = tensor_representation.varlen_sparse_tensor.column_name
    self._column_index = arrow_schema.get_field_index(varlen_column)
    _, value_type = _GetNestDepthAndValueType(
        arrow_schema[self._column_index])
    self._dtype = _ArrowTypeToTfDtype(value_type)
def __init__(self, arrow_schema: pa.Schema,
             tensor_representation: schema_pb2.TensorRepresentation):
    """Resolves the value column, its TF dtype and the binary-conversion fn
    for a varlen sparse tensor."""
    super().__init__(arrow_schema, tensor_representation)
    varlen_column = tensor_representation.varlen_sparse_tensor.column_name
    self._column_index = arrow_schema.get_field_index(varlen_column)
    _, value_type = _GetNestDepthAndValueType(
        arrow_schema, path.ColumnPath(varlen_column))
    self._dtype = _ArrowTypeToTfDtype(value_type)
    self._convert_to_binary_fn = _GetConvertToBinaryFn(value_type)
def _set_date_column_type_to_timestamp_ms(schema: pa.Schema) -> pa.Schema:
    """Return a copy of `schema` with the "DATE" field typed as timestamp("ms").

    Args:
      schema: a schema that must contain a field named "DATE".

    Returns:
      A new pa.Schema with the same field names in the same order; only the
      "DATE" field's type is changed to pa.timestamp("ms").

    Raises:
      KeyError: if `schema` has no field named "DATE".
    """
    dt_timestamp_ms = pa.timestamp("ms")
    indexof_date_field = schema.get_field_index("DATE")
    # get_field_index returns -1 for a missing name; without this guard the
    # assignment below would silently retype the *last* field instead.
    if indexof_date_field == -1:
        raise KeyError('schema has no "DATE" field')
    types = schema.types
    types[indexof_date_field] = dt_timestamp_ms
    # NOTE(review): rebuilding from (name, type) pairs drops per-field
    # nullability flags and metadata, matching the original behavior.
    field_list = zip(schema.names, types)
    return pa.schema(field_list)
def __init__(self, arrow_schema: pa.Schema,
             tensor_representation: schema_pb2.TensorRepresentation):
    """Caches the feature path, ragged rank, dtype and row-partition dtype
    for the described ragged tensor."""
    super(_RaggedTensorHandler, self).__init__(arrow_schema,
                                               tensor_representation)
    ragged_rep = tensor_representation.ragged_tensor
    self._steps = list(ragged_rep.feature_path.step)
    # The first step names the Arrow column that holds the values.
    self._column_index = arrow_schema.get_field_index(self._steps[0])
    self._ragged_rank, value_type = _GetNestDepthAndValueType(
        arrow_schema, self._steps)
    self._dtype = _ArrowTypeToTfDtype(value_type)
    self._row_partition_dtype = ragged_rep.row_partition_dtype
    self._convert_to_binary_fn = _GetConvertToBinaryFn(value_type)
def __init__(self, arrow_schema: pa.Schema,
             tensor_representation: schema_pb2.TensorRepresentation):
    """Caches the column index, dtype and (un)batched shape information for
    the described dense tensor."""
    super(_BaseDenseTensorHandler, self).__init__(arrow_schema,
                                                  tensor_representation)
    dense_rep = tensor_representation.dense_tensor
    self._column_index = arrow_schema.get_field_index(dense_rep.column_name)
    _, value_type = _GetNestDepthAndValueType(
        arrow_schema[self._column_index])
    self._dtype = _ArrowTypeToTfDtype(value_type)
    unbatched_shape = [d.size for d in dense_rep.shape.dim]
    # The leading None stands for the (unknown) batch dimension.
    self._shape = [None] + unbatched_shape
    # initial=1 makes the product of an empty shape 1 (scalar feature).
    self._unbatched_flat_len = int(np.prod(unbatched_shape, initial=1))
def __init__(self, arrow_schema: pa.Schema,
             tensor_representation: schema_pb2.TensorRepresentation):
    """Caches the column index, dtype, binary-conversion fn and (un)batched
    shape information for the described dense tensor."""
    super().__init__(arrow_schema, tensor_representation)
    dense_rep = tensor_representation.dense_tensor
    dense_column = dense_rep.column_name
    self._column_index = arrow_schema.get_field_index(dense_column)
    _, value_type = _GetNestDepthAndValueType(
        arrow_schema, path.ColumnPath(dense_column))
    self._dtype = _ArrowTypeToTfDtype(value_type)
    self._convert_to_binary_fn = _GetConvertToBinaryFn(value_type)
    unbatched_shape = [d.size for d in dense_rep.shape.dim]
    # The leading None stands for the (unknown) batch dimension.
    self._shape = [None] + unbatched_shape
    # initial=1 makes the product of an empty shape 1 (scalar feature).
    self._unbatched_flat_len = int(np.prod(unbatched_shape, initial=1))
def __init__(self, arrow_schema: pa.Schema,
             tensor_representation: schema_pb2.TensorRepresentation):
    """Precomputes the value path, partition split, fixed inner shape and
    dtypes for the described ragged tensor."""
    super().__init__(arrow_schema, tensor_representation)
    ragged_rep = tensor_representation.ragged_tensor
    self._value_path = path.ColumnPath.from_proto(ragged_rep.feature_path)
    self._column_index = arrow_schema.get_field_index(
        ragged_rep.feature_path.step[0])
    self._outer_ragged_rank, value_type = _GetNestDepthAndValueType(
        arrow_schema, self._value_path)
    # Split partitions to the ones defining Ragged dimensions and the ones
    # defining the outer dimensions shape (through uniform row length
    # partitions).
    ragged_partitions = []
    fixed_dimension_partitions = []
    in_fixed_suffix = True
    # Walk the partitions innermost-first so the trailing run of
    # uniform_row_length partitions (the fixed inner shape of the resulting
    # RaggedTensor) is peeled off before any ragged partition is seen.
    for partition in reversed(ragged_rep.partition):
        if in_fixed_suffix and partition.HasField("uniform_row_length"):
            fixed_dimension_partitions.append(partition)
        else:
            in_fixed_suffix = False
            ragged_partitions.append(partition)
    # Restore outer-to-inner order.
    ragged_partitions.reverse()
    fixed_dimension_partitions.reverse()
    self._ragged_partitions = ragged_partitions
    self._fixed_dimension_partitions = fixed_dimension_partitions
    inner_fixed_shape = []
    inferred_dimensions_elements = 1
    for partition in fixed_dimension_partitions:
        inner_fixed_shape.append(partition.uniform_row_length)
        inferred_dimensions_elements *= partition.uniform_row_length
    self._inner_fixed_shape = inner_fixed_shape
    self._values_fixed_shape = [-1] + inner_fixed_shape
    self._inferred_dimensions_elements = inferred_dimensions_elements
    self._dtype = _ArrowTypeToTfDtype(value_type)
    self._row_partition_dtype = ragged_rep.row_partition_dtype
    self._convert_to_binary_fn = _GetConvertToBinaryFn(value_type)