Beispiel #1
0
def serialize_tf_example(datum):
    """Serialize a feature dictionary into a tfrecord ``Example`` proto.

    Args:
        datum: Dictionary mapping feature names to ``(value, dtype)``
            tuples. ``dtype`` is one of "byte", "float" or "int".
            ``value`` is normally a sequence of items. As a convenience,
            a bare ``int``/``float`` (any dtype) or a bare ``bytes``
            object (dtype "byte" only) is treated as a one-element
            sequence.
    Returns:
        The serialized ``Example`` message as bytes.
    Raises:
        KeyError: If ``dtype`` is not "byte", "float" or "int".
    """
    # Built once per call rather than once per feature: the table does
    # not depend on the loop variables.
    feature_builders = {
        "byte":
        lambda f: example_pb2.Feature(bytes_list=example_pb2.BytesList(
            value=f)),
        "float":
        lambda f: example_pb2.Feature(float_list=example_pb2.FloatList(
            value=f)),
        "int":
        lambda f: example_pb2.Feature(int64_list=example_pb2.Int64List(
            value=f))
    }

    features = {}
    for key, (value, dtype) in datum.items():
        build_feature = feature_builders[dtype]
        # Only wrap inputs that could never have worked unwrapped: plain
        # numbers are not iterable, and iterating ``bytes`` yields ints,
        # which are not valid entries for a bytes list. ``bytes`` passed
        # with another dtype is left alone so that its existing
        # "sequence of ints" interpretation is preserved.
        is_scalar = isinstance(value, (int, float)) or (
            dtype == "byte" and isinstance(value, bytes))
        if is_scalar:
            value = [value]
        features[key] = build_feature(value)

    example_proto = example_pb2.Example(features=example_pb2.Features(
        feature=features))
    return example_proto.SerializeToString()
Beispiel #2
0
def example_loader(
    data_path: str,
    index_path: typing.Union[str, None],
    description: typing.Union[typing.List[str], typing.Dict[str, str],
                              None] = None,
    shard: typing.Optional[typing.Tuple[int, int]] = None,
) -> typing.Iterable[typing.Dict[str, np.ndarray]]:
    """Lazily yield the decoded features of every example in a dataset.

    Each raw record produced by ``tfrecord_iterator`` is parsed as an
    ``Example`` message, and its features are handed to
    ``extract_feature_dict`` together with ``description``; whatever
    that helper returns is yielded unchanged.

    Params:
    -------
    data_path: str
        TFRecord file path.

    index_path: str or None
        Index file path, or None when no index file is available.

    description: list or dict of str, optional, default=None
        Selection of features to extract: either a list of feature
        names, or a dict from feature name to its data type ("byte",
        "float", or "int"). None (default) requests every feature.
        Forwarded as-is to ``extract_feature_dict``.

    shard: tuple of ints, optional, default=None
        A tuple (index, count), i.e. worker_id and num_workers, used to
        split the dataset among several workers. Forwarded as-is to
        ``tfrecord_iterator``.

    Yields:
    -------
    features: dict of {str, np.ndarray}
        Decoded features of a single record.
    """

    # User-facing dtype name -> name of the proto field storing it.
    proto_field_by_dtype = {
        "byte": "bytes_list",
        "float": "float_list",
        "int": "int64_list",
    }

    for raw_record in tfrecord_iterator(data_path, index_path, shard):
        parsed = example_pb2.Example()
        parsed.ParseFromString(raw_record)

        decoded = extract_feature_dict(parsed.features, description,
                                       proto_field_by_dtype)
        yield decoded
Beispiel #3
0
    def serialize_tf_example(
            datum: typing.Dict[str, typing.Tuple[typing.Any, str]]) -> bytes:
        """Encode a feature dictionary as a serialized ``Example`` proto.

        Params:
        -------
        datum: dict
            Maps each feature name to a ``(value, dtype)`` tuple, where
            dtype is "byte", "float" or "int". A value that is not a
            list, tuple or ``np.ndarray`` is wrapped into a one-element
            list before encoding.

        Returns:
        --------
        proto: bytes
            The ``Example`` message serialized to bytes.
        """
        # dtype name -> (list message class name, Feature field name).
        # Classes are resolved by name at use time, one per feature.
        proto_spec = {
            "byte": ("BytesList", "bytes_list"),
            "float": ("FloatList", "float_list"),
            "int": ("Int64List", "int64_list"),
        }

        encoded = {}
        for name, (payload, dtype) in datum.items():
            is_sequence = isinstance(payload, (list, tuple, np.ndarray))
            if not is_sequence:
                payload = [payload]
            list_cls_name, field_name = proto_spec[dtype]
            list_message = getattr(example_pb2, list_cls_name)(value=payload)
            encoded[name] = example_pb2.Feature(**{field_name: list_message})

        feature_set = example_pb2.Features(feature=encoded)
        return example_pb2.Example(features=feature_set).SerializeToString()
Beispiel #4
0
def tfrecord_loader(data_path, index_path, description, shard=None):
    """Create an iterator from a tfrecord dataset.

    Every record is parsed as an ``Example`` message and the features
    named in ``description`` are decoded into numpy arrays.

    Args:
        data_path: Path of the input data.
        index_path: Path of index file. This can be set to None if not available.
        description: A dictionary of key and values where keys are the name of the features and values correspond to
                     data type. The data type can be "byte", "float" or "int".
        shard: A tuple (index, count) representing the shard information. (default : None)
    Returns:
        An iterator that generates individual data records, each a dict
        from feature name to ``np.ndarray`` (uint8 for "byte", float32
        for "float", int32 for "int").
    Raises:
        KeyError: If a data type in ``description`` is not "byte",
            "float" or "int".
        ValueError: If a requested key is missing from a record.
    """
    # Loop-invariant lookup table; built once instead of once per key
    # of every record.
    proto_field_by_typename = {
        "byte": "bytes_list",
        "float": "float_list",
        "int": "int64_list"
    }

    record_iterator = tfrecord_iterator(data_path, index_path, shard)

    for record in record_iterator:
        example = example_pb2.Example()
        example.ParseFromString(record)
        feature_map = example.features.feature

        features = {}
        for key, typename in description.items():
            tf_typename = proto_field_by_typename[typename]
            if key not in feature_map:
                raise ValueError("Key {} doesn't exist.".format(key))
            value = getattr(feature_map[key], tf_typename).value
            if typename == "byte":
                # Only the first entry of the bytes list is decoded.
                value = np.frombuffer(value[0], dtype=np.uint8)
            elif typename == "float":
                value = np.array(value, dtype=np.float32)
            elif typename == "int":
                # NOTE(review): the proto field is an int64 list but the
                # result is narrowed to int32, so values outside the
                # int32 range cannot round-trip. Kept as-is because
                # callers may rely on the int32 dtype.
                value = np.array(value, dtype=np.int32)
            features[key] = value

        yield features
Beispiel #5
0
def tfrecord_loader(data_path: str,
                    index_path: typing.Union[str, None],
                    description: typing.Union[typing.List[str], typing.Dict[str, str], None] = None,
                    shard: typing.Optional[typing.Tuple[int, int]] = None,
                    ) -> typing.Iterable[typing.Dict[str, np.ndarray]]:
    """Create an iterator over the (decoded) examples contained within
    the dataset.

    Decodes raw bytes of the features (contained within the dataset)
    into its respective format.

    Params:
    -------
    data_path: str
        TFRecord file path.

    index_path: str or None
        Index file path. Can be set to None if no file is available.

    description: list or dict of str, optional, default=None
        List of keys or dict of (key, value) pairs to extract from each
        record. The keys represent the name of the features and the
        values ("byte", "float", or "int") correspond to the data type.
        If dtypes are provided, then they are verified against the
        inferred type for compatibility purposes. If None (default),
        then all features contained in each record are extracted.

    shard: tuple of ints, optional, default=None
        A tuple (index, count) representing worker_id and num_workers
        count. Necessary to evenly split/shard the dataset among many
        workers (i.e. >1).

    Yields:
    -------
    features: dict of {str, np.ndarray}
        Decoded bytes of the features into its respective data type (for
        an individual record).

    Raises:
    -------
    KeyError
        If a requested key is missing from a record, or a data type in
        `description` is not "byte", "float" or "int".
    TypeError
        If a data type given in `description` does not match the type
        stored in the record.
    """

    typename_mapping = {
        "byte": "bytes_list",
        "float": "float_list",
        "int": "int64_list"
    }

    # A list of keys carries no dtype information; normalise it once
    # into a {key: None} dict instead of on every record.
    if isinstance(description, list):
        description = dict.fromkeys(description, None)

    record_iterator = tfrecord_iterator(data_path, index_path, shard)

    for record in record_iterator:
        example = example_pb2.Example()
        example.ParseFromString(record)

        feature_map = example.features.feature
        all_keys = list(feature_map.keys())

        # With no description, select every key of *this* record. The
        # selection is kept in a local so that the first record's keys
        # are not imposed on all subsequent records.
        if description is None:
            selected = dict.fromkeys(all_keys, None)
        else:
            selected = description

        features = {}
        for key, typename in selected.items():
            if key not in feature_map:
                raise KeyError(f"Key {key} doesn't exist (select from {all_keys})!")
            # NOTE: We assume that each key in the example has only one field
            # (either "bytes_list", "float_list", or "int64_list")!
            field = feature_map[key].ListFields()[0]
            inferred_typename, value = field[0].name, field[1].value
            if typename is not None:
                tf_typename = typename_mapping[typename]
                if tf_typename != inferred_typename:
                    reversed_mapping = {v: k for k, v in typename_mapping.items()}
                    raise TypeError(f"Incompatible type '{typename}' for `{key}` "
                                    f"(should be '{reversed_mapping[inferred_typename]}').")

            # Decode raw bytes into respective data types
            if inferred_typename == "bytes_list":
                # Only the first entry of the bytes list is decoded.
                value = np.frombuffer(value[0], dtype=np.uint8)
            elif inferred_typename == "float_list":
                value = np.array(value, dtype=np.float32)
            elif inferred_typename == "int64_list":
                # NOTE(review): int64 values are narrowed to int32 here,
                # so values outside the int32 range cannot round-trip.
                # Kept as-is because callers may rely on the int32 dtype.
                value = np.array(value, dtype=np.int32)
            features[key] = value

        yield features