def __init__(self,
             example_specs,
             path,
             hash_salt,
             disable_shuffling,
             file_format=file_adapters.DEFAULT_FILE_FORMAT):
  """Init BeamWriter.

  Args:
    example_specs: spec used to build the `ExampleSerializer` that encodes
      each example before it is written.
    path: str, path where to write tfrecord file. Eg:
      "/foo/mnist-train.tfrecord". The suffix (eg: `.00000-of-00004`) will
      be added by the BeamWriter. Note that file "{path}.split_info.json"
      is also created. It contains a list with the number of examples in
      each final shard. Eg: "[10,11,10,11]".
    hash_salt: string, the salt to use for hashing of keys.
    disable_shuffling: bool, specifies whether to shuffle the records.
    file_format: file_adapters.FileFormat, format of the record files in
      which the dataset will be read/written from.
  """
  # Kept so the writer can be re-instantiated with identical arguments
  # (e.g. after being pickled to Beam workers).
  self._original_state = dict(
      example_specs=example_specs,
      path=path,
      hash_salt=hash_salt,
      disable_shuffling=disable_shuffling,
      file_format=file_format)
  self._path = os.fspath(path)
  self._split_info_path = f"{path}.split_info.json"
  self._serializer = example_serializer.ExampleSerializer(example_specs)
  self._example_specs = example_specs
  self._hasher = hashing.Hasher(hash_salt)
  self._split_info = None
  self._file_format = file_format
  self._disable_shuffling = disable_shuffling
def __init__(
    self,
    serializer: example_serializer.Serializer,
    filename_template: naming.ShardedFileTemplate,
    hash_salt,
    disable_shuffling: bool,
    file_format: file_adapters.FileFormat = file_adapters.DEFAULT_FILE_FORMAT,
):
  """Init BeamWriter.

  Note that file "{filepath_prefix}.split_info.json" is also created. It
  contains a list with the number of examples in each final shard. Eg:
  "[10,11,10,11]".

  Args:
    serializer: class that can serialize examples.
    filename_template: template to format sharded filenames.
    hash_salt: string, the salt to use for hashing of keys.
    disable_shuffling: bool, specifies whether to shuffle the records.
    file_format: file_adapters.FileFormat, format of the record files in
      which the dataset will be read/written from.
  """
  # Kept so the writer can be re-instantiated with identical arguments
  # (e.g. after being pickled to Beam workers).
  self._original_state = dict(
      serializer=serializer,
      filename_template=filename_template,
      hash_salt=hash_salt,
      disable_shuffling=disable_shuffling,
      file_format=file_format)
  self._filename_template = filename_template
  self._split_info_path = (
      f"{filename_template.filepath_prefix()}.split_info.json")
  self._serializer = serializer
  self._hasher = hashing.Hasher(hash_salt)
  self._split_info = None
  self._file_format = file_format
  self._disable_shuffling = disable_shuffling
def __init__(self, dirpath, hash_salt):
  """Initialize Shuffler.

  Args:
    dirpath (string): directory in which to store temporary files.
    hash_salt (string or bytes): salt to hash keys.
  """
  group = uuid.uuid4()
  self._hasher = hashing.Hasher(hash_salt)
  # One temp file per bucket, all sharing the same unique group id.
  self._buckets = [
      _Bucket(os.path.join(dirpath, 'bucket_%s_%03d.tmp' % (group, idx)))
      for idx in range(BUCKETS_NUMBER)
  ]
  self._read_only = False
  self._total_bytes = 0
  # Buffer records in memory until enough data has been gathered.
  self._in_memory = True
  self._mem_buffer = []
def __init__(self, example_specs, path, hash_salt):
  """Init BeamWriter.

  Args:
    example_specs: spec used to build the `ExampleSerializer` that encodes
      each example before it is written.
    path: str, path where to write tfrecord file. Eg:
      "/foo/mnist-train.tfrecord". The suffix (eg: `.00000-of-00004`) will
      be added by the BeamWriter. Note that file "{path}.shard_lengths.json"
      is also created. It contains a list with the number of examples in
      each final shard. Eg: "[10,11,10,11]".
    hash_salt: string, the salt to use for hashing of keys.
  """
  # Kept so the writer can be re-instantiated with identical arguments
  # (e.g. after being pickled to Beam workers).
  self._original_state = dict(
      example_specs=example_specs, path=path, hash_salt=hash_salt)
  self._path = path
  self._shards_length_path = f"{path}.shard_lengths.json"
  self._serializer = example_serializer.ExampleSerializer(example_specs)
  self._hasher = hashing.Hasher(hash_salt)
  self._shard_lengths = None
def test_ascii(self):
  # Hash of a plain ASCII key with an empty salt is stable.
  hasher = hashing.Hasher(salt='')
  self.assertEqual(
      hasher.hash_key('foo'),
      229609063533823256041787889330700985560,
  )
def test_ints(self):
  # Integer keys (including ones larger than 64 bits) hash deterministically.
  hasher = hashing.Hasher(salt='')
  expected = {
      0: 276215275525073243129443018166533317850,
      123455678901234567890: 6876359009333865997613257802033240610,
  }
  for key, value in expected.items():
    self.assertEqual(hasher.hash_key(key), value)
def test_backslash(self):
  # Forward and backward slashes in keys must hash to the same value.
  hasher = hashing.Hasher(salt='')
  forward = hasher.hash_key('x/y')
  backward = hasher.hash_key('x\\y')
  self.assertEqual(backward, forward)
  self.assertEqual(backward, 122546703782554533059483853573887619473)