def serialize_tf_example(datum):
    """Serialize example into tfrecord.Example proto.

    Args:
        datum: Dictionary mapping each feature name to a tuple of form
            (value, dtype). dtype can be "byte", "float" or "int".
            value is normally a sequence of items; a bare scalar, `bytes`
            or `str` is wrapped into a one-element list, while any other
            iterable is handed to the proto list unchanged.

    Returns:
        Serialized tfrecord.example to bytes.

    Raises:
        KeyError: If a dtype is not "byte", "float" or "int".
    """
    # Built once per call rather than once per feature. The lambdas only
    # touch example_pb2 when they are actually invoked.
    builders = {
        "byte": lambda items: example_pb2.Feature(
            bytes_list=example_pb2.BytesList(value=items)),
        "float": lambda items: example_pb2.Feature(
            float_list=example_pb2.FloatList(value=items)),
        "int": lambda items: example_pb2.Feature(
            int64_list=example_pb2.Int64List(value=items)),
    }

    features = {}
    for key, (value, dtype) in datum.items():
        try:
            build = builders[dtype]
        except KeyError:
            # Same exception type as a plain failed lookup, but tell the
            # caller which feature is at fault and what is accepted.
            raise KeyError(
                f"Unsupported dtype {dtype!r} for feature {key!r} "
                f"(select from {sorted(builders)})") from None

        # The proto list fields are repeated, so a single item has to be
        # wrapped. bytes/str are iterable but represent one item here.
        if isinstance(value, (bytes, str)) or not hasattr(value, "__iter__"):
            value = [value]

        features[key] = build(value)

    example_proto = example_pb2.Example(
        features=example_pb2.Features(feature=features))
    return example_proto.SerializeToString()
def example_loader(
    data_path: str,
    index_path: typing.Union[str, None],
    description: typing.Union[typing.List[str], typing.Dict[str, str], None] = None,
    shard: typing.Optional[typing.Tuple[int, int]] = None,
) -> typing.Iterable[typing.Dict[str, np.ndarray]]:
    """Lazily iterate over the decoded examples stored in a TFRecord file.

    Every raw record produced by `tfrecord_iterator` is parsed as an
    `Example` proto; its features are then handed to `extract_feature_dict`,
    which performs the selection and decoding.

    Params:
    -------
    data_path: str
        TFRecord file path.

    index_path: str or None
        Index file path. Can be set to None if no file is available.

    description: list or dict of str, optional, default=None
        List of keys or dict of (key, value) pairs to extract from each
        record. The keys are feature names and the values ("byte", "float",
        or "int") are the expected data types. If dtypes are provided, they
        are verified against the inferred type for compatibility purposes.
        If None (default), then all features contained in the file are
        extracted.

    shard: tuple of ints, optional, default=None
        A tuple (index, count) representing worker_id and num_workers
        count. Necessary to evenly split/shard the dataset among many
        workers (i.e. >1).

    Yields:
    -------
    features: dict of {str, np.ndarray}
        The decoded features of one individual record.
    """
    # User-facing dtype name -> name of the proto list field holding it.
    proto_field_by_dtype = dict(
        byte="bytes_list",
        float="float_list",
        int="int64_list",
    )

    def decode(raw):
        # Parse one serialized record and decode the requested features.
        message = example_pb2.Example()
        message.ParseFromString(raw)
        return extract_feature_dict(
            message.features, description, proto_field_by_dtype)

    for raw in tfrecord_iterator(data_path, index_path, shard):
        yield decode(raw)
def serialize_tf_example(
        datum: typing.Dict[str, typing.Tuple[typing.Any, str]]) -> bytes:
    """Turn a feature dictionary into a serialized tfrecord.Example proto.

    Params:
    -------
    datum: dict
        Maps each feature name to a tuple of form (value, dtype), where
        dtype is "byte", "float" or "int". A value that is not a list,
        tuple or np.ndarray is treated as a single item and wrapped into a
        one-element list.

    Returns:
    --------
    proto: bytes
        The tfrecord.Example message serialized to bytes.
    """

    def as_bytes_feature(items):
        return example_pb2.Feature(
            bytes_list=example_pb2.BytesList(value=items))

    def as_float_feature(items):
        return example_pb2.Feature(
            float_list=example_pb2.FloatList(value=items))

    def as_int_feature(items):
        return example_pb2.Feature(
            int64_list=example_pb2.Int64List(value=items))

    builders = {
        "byte": as_bytes_feature,
        "float": as_float_feature,
        "int": as_int_feature,
    }
    sequence_types = (list, tuple, np.ndarray)

    encoded = {}
    for name, (payload, kind) in datum.items():
        # The proto list fields are repeated, so lone items get wrapped.
        items = payload if isinstance(payload, sequence_types) else [payload]
        encoded[name] = builders[kind](items)

    message = example_pb2.Example(
        features=example_pb2.Features(feature=encoded))
    return message.SerializeToString()
def tfrecord_loader(data_path, index_path, description, shard=None):
    """Iterate over the decoded records of a tfrecord dataset.

    Args:
        data_path: Path of the input data.
        index_path: Path of index file. This can be set to None if not
            available.
        description: A dictionary whose keys are the names of the features
            to read and whose values are the corresponding data types. The
            data type can be "byte", "float" or "int".
        shard: A tuple (index, count) representing the shard information.
            (default : None)

    Returns:
        An iterator that generates individual data records, each a dict
        mapping feature name to a numpy array.

    Raises:
        KeyError: If a data type in description is not supported.
        ValueError: If a requested key is missing from a record.
    """
    # typename -> (proto list field to read, decoder to a numpy array).
    # "byte" decodes only the first entry of the bytes list.
    # NOTE(review): int64_list values are narrowed to int32 here; values
    # outside the int32 range presumably do not survive -- confirm whether
    # callers rely on the int32 dtype before widening.
    codecs = {
        "byte": ("bytes_list",
                 lambda raw: np.frombuffer(raw[0], dtype=np.uint8)),
        "float": ("float_list",
                  lambda raw: np.array(raw, dtype=np.float32)),
        "int": ("int64_list",
                lambda raw: np.array(raw, dtype=np.int32)),
    }

    for serialized in tfrecord_iterator(data_path, index_path, shard):
        message = example_pb2.Example()
        message.ParseFromString(serialized)
        feature_map = message.features.feature

        decoded = {}
        for name, kind in description.items():
            # The data type is resolved before the key check, so an
            # unsupported type is reported first.
            field_name, decode = codecs[kind]
            if name not in feature_map:
                raise ValueError("Key {} doesn't exist.".format(name))
            decoded[name] = decode(getattr(feature_map[name], field_name).value)

        yield decoded
def tfrecord_loader(data_path: str,
                    index_path: typing.Union[str, None],
                    description: typing.Union[typing.List[str], typing.Dict[str, str], None] = None,
                    shard: typing.Optional[typing.Tuple[int, int]] = None,
                    ) -> typing.Iterable[typing.Dict[str, np.ndarray]]:
    """Create an iterator over the (decoded) examples contained within
    the dataset.

    Decodes raw bytes of the features (contained within the dataset)
    into its respective format.

    Params:
    -------
    data_path: str
        TFRecord file path.

    index_path: str or None
        Index file path. Can be set to None if no file is available.

    description: list or dict of str, optional, default=None
        List of keys or dict of (key, value) pairs to extract from each
        record. The keys represent the name of the features and the
        values ("byte", "float", or "int") correspond to the data type.
        If dtypes are provided, then they are verified against the
        inferred type for compatibility purposes. If None (default),
        then all features contained in each record are extracted.

    shard: tuple of ints, optional, default=None
        A tuple (index, count) representing worker_id and num_workers
        count. Necessary to evenly split/shard the dataset among many
        workers (i.e. >1).

    Yields:
    -------
    features: dict of {str, np.ndarray}
        Decoded bytes of the features into its respective data type (for
        an individual record).

    Raises:
    -------
    KeyError
        If a requested key is missing from a record, or a dtype in
        `description` is not "byte", "float" or "int".

    TypeError
        If a dtype in `description` disagrees with the type stored in the
        record.
    """
    typename_mapping = {
        "byte": "bytes_list",
        "float": "float_list",
        "int": "int64_list"
    }

    # A list of keys carries no dtypes; normalise it once to a dict whose
    # values are None (meaning "accept whatever type is stored").
    if isinstance(description, list):
        description = dict.fromkeys(description, None)

    record_iterator = tfrecord_iterator(data_path, index_path, shard)

    for record in record_iterator:
        example = example_pb2.Example()
        example.ParseFromString(record)

        all_keys = list(example.features.feature.keys())

        # With no description, take every key of *this* record. The caller's
        # `description` must not be overwritten here: doing so would freeze
        # the key set of the first record and apply it to all later ones.
        if description is None:
            wanted = dict.fromkeys(all_keys, None)
        else:
            wanted = description

        features = {}
        for key, typename in wanted.items():
            if key not in all_keys:
                raise KeyError(f"Key {key} doesn't exist (select from {all_keys})!")

            # NOTE: We assume that each key in the example has only one field
            # (either "bytes_list", "float_list", or "int64_list")!
            field = example.features.feature[key].ListFields()[0]
            inferred_typename, value = field[0].name, field[1].value

            if typename is not None:
                tf_typename = typename_mapping[typename]
                if tf_typename != inferred_typename:
                    # Only needed to phrase the error in user-facing names.
                    reversed_mapping = {v: k for k, v in typename_mapping.items()}
                    raise TypeError(f"Incompatible type '{typename}' for `{key}` "
                                    f"(should be '{reversed_mapping[inferred_typename]}').")

            # Decode raw bytes into respective data types
            if inferred_typename == "bytes_list":
                # Only the first entry of the bytes list is decoded.
                value = np.frombuffer(value[0], dtype=np.uint8)
            elif inferred_typename == "float_list":
                value = np.array(value, dtype=np.float32)
            elif inferred_typename == "int64_list":
                # NOTE(review): int64 values are narrowed to int32 here;
                # confirm whether callers rely on int32 before widening.
                value = np.array(value, dtype=np.int32)
            features[key] = value

        yield features