Esempio n. 1
0
    def __init__(
        self,
        example_specs,
        path,
        hash_salt,
        disable_shuffling,
        file_format=file_adapters.DEFAULT_FILE_FORMAT,
    ):
        """Set up a BeamWriter for a single output path.

        Args:
          example_specs: specs handed to `example_serializer.ExampleSerializer`
            to build the serializer used by this writer.
          path: str or path-like, path where to write the record file. Eg:
            "/foo/mnist-train.tfrecord". Per the original notes, a shard
            suffix (eg: `.00000-of-00004`) is added by the BeamWriter.
            The companion path "{path}.split_info.json" is derived from it.
          hash_salt: string, the salt to use for hashing of keys.
          disable_shuffling: bool, specifies whether to shuffle the records.
          file_format: file_adapters.FileFormat, format of the record files
            in which the dataset will be read/written from.
        """
        # Untouched copy of the constructor arguments, taken before `path` is
        # normalised below.
        # NOTE(review): presumably used to re-create the writer (e.g. when
        # pickling) -- confirm against the rest of the class.
        self._original_state = {
            "example_specs": example_specs,
            "path": path,
            "hash_salt": hash_salt,
            "disable_shuffling": disable_shuffling,
            "file_format": file_format,
        }

        # Path handling: `os.fspath` rejects anything that is not
        # str/bytes/path-like before the split-info path is formatted.
        self._path = os.fspath(path)
        self._split_info_path = "%s.split_info.json" % path

        # Collaborators derived from the arguments.
        self._example_specs = example_specs
        self._serializer = example_serializer.ExampleSerializer(example_specs)
        self._hasher = hashing.Hasher(hash_salt)

        # Plain configuration and not-yet-computed state.
        self._file_format = file_format
        self._disable_shuffling = disable_shuffling
        self._split_info = None
Esempio n. 2
0
    def __init__(
        self,
        serializer: example_serializer.Serializer,
        filename_template: naming.ShardedFileTemplate,
        hash_salt,
        disable_shuffling: bool,
        file_format: file_adapters.FileFormat = (
            file_adapters.DEFAULT_FILE_FORMAT),
    ):
        """Set up a BeamWriter driven by a sharded filename template.

        The path "{filepath_prefix}.split_info.json" is derived from the
        template's `filepath_prefix()` and stored alongside the other settings.

        Args:
          serializer: class that can serialize examples.
          filename_template: template to format sharded filenames.
          hash_salt: string, the salt to use for hashing of keys.
          disable_shuffling: bool, specifies whether to shuffle the records.
          file_format: file_adapters.FileFormat, format of the record files
            in which the dataset will be read/written from.
        """
        # Untouched copy of the constructor arguments.
        # NOTE(review): presumably used to re-create the writer (e.g. when
        # pickling) -- confirm against the rest of the class.
        self._original_state = {
            "serializer": serializer,
            "filename_template": filename_template,
            "hash_salt": hash_salt,
            "disable_shuffling": disable_shuffling,
            "file_format": file_format,
        }

        # Naming: keep the template and derive the split-info path from its
        # prefix.
        self._filename_template = filename_template
        prefix = filename_template.filepath_prefix()
        self._split_info_path = f"{prefix}.split_info.json"

        # Collaborators.
        self._serializer = serializer
        self._hasher = hashing.Hasher(hash_salt)

        # Plain configuration and not-yet-computed state.
        self._file_format = file_format
        self._disable_shuffling = disable_shuffling
        self._split_info = None
Esempio n. 3
0
    def __init__(self, dirpath, hash_salt):
        """Create a Shuffler with one `_Bucket` per bucket index.

        Args:
          dirpath (string): directory in which to store temporary files.
          hash_salt (string or bytes): salt to hash keys.
        """
        # A fresh random id is embedded in every bucket file name of this
        # instance.
        group_id = uuid.uuid4()
        self._hasher = hashing.Hasher(hash_salt)

        # Bucket files are named bucket_<group id>_<zero-padded index>.tmp
        # inside `dirpath`.
        bucket_name = 'bucket_%s_%03d.tmp'
        self._buckets = [
            _Bucket(os.path.join(dirpath, bucket_name % (group_id, index)))
            for index in range(BUCKETS_NUMBER)
        ]

        self._read_only = False
        self._total_bytes = 0
        # Data is held in memory until enough of it has been gathered.
        self._in_memory = True
        self._mem_buffer = []
Esempio n. 4
0
    def __init__(self, example_specs, path, hash_salt):
        """Set up a BeamWriter for a single output path.

        Args:
          example_specs: specs handed to `example_serializer.ExampleSerializer`
            to build the serializer used by this writer.
          path: str, path where to write tfrecord file. Eg:
            "/foo/mnist-train.tfrecord". Per the original notes, a shard
            suffix (eg: `.00000-of-00004`) is added by the BeamWriter, and a
            file "{path}.shard_lengths.json" is also created, holding a list
            with the number of examples in each final shard. Eg:
            "[10,11,10,11]".
          hash_salt: string, the salt to use for hashing of keys.
        """
        # Untouched copy of the constructor arguments.
        # NOTE(review): presumably used to re-create the writer (e.g. when
        # pickling) -- confirm against the rest of the class.
        self._original_state = {
            "example_specs": example_specs,
            "path": path,
            "hash_salt": hash_salt,
        }

        # Output location and the path derived from it.
        self._path = path
        self._shards_length_path = "%s.shard_lengths.json" % path
        # Shard lengths are not known at construction time.
        self._shard_lengths = None

        # Collaborators derived from the arguments.
        self._serializer = example_serializer.ExampleSerializer(example_specs)
        self._hasher = hashing.Hasher(hash_salt)
Esempio n. 5
0
 def test_ascii(self):
   """Hashing the ASCII key 'foo' with an empty salt gives the pinned value."""
   expected = 229609063533823256041787889330700985560
   actual = hashing.Hasher(salt='').hash_key('foo')
   self.assertEqual(actual, expected)
Esempio n. 6
0
 def test_ints(self):
   """Integer keys hash to their pinned values with an empty salt."""
   hasher = hashing.Hasher(salt='')
   # (key, expected hash) pairs, checked in order with one shared hasher.
   pinned_hashes = (
       (0, 276215275525073243129443018166533317850),
       (123455678901234567890, 6876359009333865997613257802033240610),
   )
   for key, expected in pinned_hashes:
     self.assertEqual(hasher.hash_key(key), expected)
Esempio n. 7
0
 def test_backslash(self):
      """A backslash-separated key hashes like its forward-slash twin."""
      hasher = hashing.Hasher(salt='')
      slash_hash = hasher.hash_key('x/y')
      backslash_hash = hasher.hash_key('x\\y')
      # Both spellings must agree, and the shared value is pinned.
      self.assertEqual(backslash_hash, slash_hash)
      self.assertEqual(backslash_hash, 122546703782554533059483853573887619473)