Example #1
    def testSaveWithOptions(self) -> None:
        tmp_folder = self.make_tempdir()
        tmp_file = str(tmp_folder / "save.output")

        num_elems = 1234
        blobs = self.create_test_blobs(num_elems)

        # Saves the blobs to a local db.
        save_op = core.CreateOperator(
            "Save",
            [name for name, data in blobs],
            [],
            absolute_path=1,
            db=tmp_file,
            db_type=self._db_type,
            chunk_size=40,
            options=caffe2_pb2.SerializationOptions(
                options=[
                    BlobSerializationOptions(
                        blob_name_regex="int16_data", chunk_size=10
                    ),
                    BlobSerializationOptions(
                        blob_name_regex=".*16_data", chunk_size=20
                    ),
                    BlobSerializationOptions(
                        blob_name_regex="float16_data", chunk_size=30
                    ),
                ],
            ),
        )
        self.assertTrue(workspace.RunOperatorOnce(save_op))

        self.load_and_check_blobs(blobs, [tmp_file])

        blob_chunks = self._read_chunk_info(Path(tmp_file))
        # We explicitly set a chunk_size of 10 for int16_data
        self.assertEqual(
            len(blob_chunks["int16_data"]), math.ceil(num_elems / 10)
        )
        # uint16_data should match the .*16_data pattern, and get a size of 20
        self.assertEqual(
            len(blob_chunks["uint16_data"]), math.ceil(num_elems / 20)
        )
        # float16_data should also match the .*16_data pattern, and get a size
        # of 20.  The explicit float16_data rule comes after the .*16_data
        # pattern, so it has lower precedence and is ignored.
        self.assertEqual(
            len(blob_chunks["float16_data"]), math.ceil(num_elems / 20)
        )
        # int64_data will get the default chunk_size of 40
        self.assertEqual(
            len(blob_chunks["int64_data"]), math.ceil(num_elems / 40)
        )
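The options list above is order-sensitive: the first BlobSerializationOptions whose regex matches a blob name wins, which is why float16_data falls under the earlier .*16_data rule rather than its own. Below is a minimal standalone sketch of that first-match-wins lookup; the helper name pick_chunk_size, the default of 40, and the use of full-string matching are assumptions for illustration, not part of the Caffe2 API.

import re

def pick_chunk_size(blob_name, options, default=40):
    # Walk the options in order; the first regex that matches the whole
    # blob name decides the chunk size, mirroring the precedence the
    # assertions above rely on.
    for opt in options:
        if re.fullmatch(opt.blob_name_regex, blob_name):
            return opt.chunk_size
    return default

# With the three rules from the Save operator above:
#   pick_chunk_size("int16_data", opts)   -> 10  (explicit rule, listed first)
#   pick_chunk_size("uint16_data", opts)  -> 20  (matches ".*16_data")
#   pick_chunk_size("float16_data", opts) -> 20  (".*16_data" wins; the later rule is ignored)
#   pick_chunk_size("int64_data", opts)   -> 40  (no match, falls back to the operator's chunk_size)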
Example #2
    def testSaveFloatToBfloat16(self) -> None:
        tmp_folder = self.make_tempdir()
        tmp_file = str(tmp_folder / "save.output")

        # Create 2 blobs with the same float data
        float_data = np.random.random_sample(4000).astype(np.float32)
        workspace.FeedBlob("float1", float_data)
        workspace.FeedBlob("float2", float_data)
        blob_names = ["float1", "float2"]

        # Serialize the data, using bfloat16 serialization for one of the blobs
        save_op = core.CreateOperator(
            "Save",
            blob_names,
            [],
            absolute_path=1,
            db=tmp_file,
            db_type=self._db_type,
            options=caffe2_pb2.SerializationOptions(
                options=[
                    BlobSerializationOptions(
                        blob_name_regex="float1",
                        float_format=BlobSerializationOptions.FLOAT_BFLOAT16,
                    ),
                ],
            ),
        )
        self.assertTrue(workspace.RunOperatorOnce(save_op))

        # As long as fbgemm was available for us to perform bfloat16 conversion,
        # the serialized data for float1 should be almost half the size of float2
        if workspace.has_fbgemm:
            blob_chunks = self._read_chunk_info(Path(tmp_file))
            self.assertEqual(len(blob_chunks["float1"]), 1, blob_chunks["float1"])
            self.assertEqual(len(blob_chunks["float2"]), 1, blob_chunks["float2"])
            self.assertLess(
                blob_chunks["float1"][0].value_size,
                0.6 * blob_chunks["float2"][0].value_size
            )

        self.load_blobs(blob_names, [tmp_file])

        # float2 should be exactly the same as the input data
        np.testing.assert_array_equal(workspace.FetchBlob("float2"), float_data)
        # float1 should be close-ish to the input data
        np.testing.assert_array_almost_equal(
            workspace.FetchBlob("float1"), float_data, decimal=2
        )
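bfloat16 keeps float32's sign bit and 8-bit exponent but only 7 mantissa bits, so each element shrinks from 4 bytes to 2; that is why the float1 chunk is expected to be well under the size of float2's, and why the round-tripped values only agree to about two decimal places. The numpy snippet below is a self-contained model of that truncation only; the operator's actual fbgemm code path may round rather than truncate.

import numpy as np

def bfloat16_roundtrip(x):
    # Zero the low 16 bits of each float32, keeping the sign, the exponent
    # and the top 7 mantissa bits (the bfloat16 payload), then reinterpret.
    bits = x.astype(np.float32).view(np.uint32)
    return (bits & np.uint32(0xFFFF0000)).view(np.float32)

data = np.random.random_sample(4000).astype(np.float32)
approx = bfloat16_roundtrip(data)
# For values in [0, 1) the truncation error stays below ~4e-3, comfortably
# inside the 1.5e-2 tolerance of assert_array_almost_equal(..., decimal=2).
print(np.abs(approx - data).max())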
Example #3
    def testEstimateBlobSizes(self) -> None:
        # Create some blobs to test with
        float_data = np.random.random_sample(4000).astype(np.float32)
        workspace.FeedBlob("float1", float_data)
        workspace.FeedBlob("float2", float_data)
        workspace.FeedBlob(
            "float3", np.random.random_sample(2).astype(np.float32)
        )
        workspace.FeedBlob(
            "ui16", np.random.randint(0, 0xffff, size=1024, dtype=np.uint16)
        )

        # Estimate the serialized size of the data.
        # Request bfloat16 serialization for one of the float blobs, just to
        # exercise size estimation when using this option.
        options = caffe2_pb2.SerializationOptions(
            options=[
                BlobSerializationOptions(
                    blob_name_regex="float1",
                    float_format=BlobSerializationOptions.FLOAT_BFLOAT16,
                    chunk_size=500,
                ),
            ],
        )
        get_blobs_op = core.CreateOperator(
            "EstimateAllBlobSizes",
            [],
            ["blob_names", "blob_sizes"],
            options=options,
        )
        self.assertTrue(workspace.RunOperatorOnce(get_blobs_op))
        blob_names = workspace.FetchBlob("blob_names")
        blob_sizes = workspace.FetchBlob("blob_sizes")

        sizes_by_name: Dict[str, int] = {}
        for idx, name in enumerate(blob_names):
            sizes_by_name[name.decode("utf-8")] = blob_sizes[idx]

        # Note that the returned blob list also includes this operator's own
        # outputs (blob_names and blob_sizes).
        expected_blobs = [
            "float1", "float2", "float3", "ui16",
            "blob_names", "blob_sizes"
        ]
        self.assertEqual(set(sizes_by_name.keys()), set(expected_blobs))

        def check_expected_blob_size(
            name: str, num_elems: int, elem_size: int, num_chunks: int = 1
        ) -> None:
            # The estimation code applies a fixed 50 byte per-chunk overhead to
            # account for the extra space required for other fixed TensorProto
            # message fields.
            per_chunk_overhead = 50
            expected_size = (
                (num_chunks * (len(name) + per_chunk_overhead))
                + (num_elems * elem_size)
            )
            self.assertEqual(
                sizes_by_name[name],
                expected_size,
                f"expected size mismatch for {name}"
            )

        check_expected_blob_size("ui16", 1024, 3)
        check_expected_blob_size("float2", 4000, 4)
        check_expected_blob_size("float3", 2, 4)

        # Our serialization options request to split float1 into 500-element
        # chunks when saving it.  If fbgemm is available then the float1 blob
        # will be serialized using 2 bytes per element instead of 4 bytes.
        float1_num_chunks = 4000 // 500
        if workspace.has_fbgemm:
            check_expected_blob_size("float1", 4000, 2, float1_num_chunks)
        else:
            check_expected_blob_size("float1", 4000, 4, float1_num_chunks)

        check_expected_blob_size("blob_names", len(expected_blobs), 50)
        check_expected_blob_size("blob_sizes", len(expected_blobs), 8)

        # Now actually save the blobs so we can compare our estimates
        # to how big the serialized data actually is.
        tmp_folder = self.make_tempdir()
        tmp_file = str(tmp_folder / "save.output")
        save_op = core.CreateOperator(
            "Save",
            list(sizes_by_name.keys()),
            [],
            absolute_path=1,
            db=tmp_file,
            db_type=self._db_type,
            options=options,
        )
        self.assertTrue(workspace.RunOperatorOnce(save_op))

        blob_chunks = self._read_chunk_info(Path(tmp_file))
        saved_sizes: Dict[str, int] = {}
        for blob_name, chunks in blob_chunks.items():
            total_size = sum(chunk.value_size for chunk in chunks)
            saved_sizes[blob_name] = total_size

        # For sanity checking, ensure that our estimates aren't
        # extremely far off
        for name in expected_blobs:
            estimated_size = sizes_by_name[name]
            saved_size = saved_sizes[name]
            difference = abs(estimated_size - saved_size)
            error_pct = 100.0 * (difference / saved_size)
            print(
                f"{name}: estimated={estimated_size} actual={saved_size} "
                f"error={error_pct:.2f}%"
            )
            # Don't check the blob_names blob.  It is a string tensor, and we
            # can't estimate string tensor sizes very well without knowing the
            # individual string lengths.  (Currently it requires 102 bytes to
            # save, but we estimate 360).
            if name == "blob_names":
                continue
            # Check that we are within 100 bytes, or within 25%
            # We are generally quite close for tensors with fixed-width fields
            # (like float), but a little farther off for tensors that use varint
            # encoding.
            if difference > 100:
                self.assertLess(error_pct, 25.0)
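For reference, the estimate that check_expected_blob_size verifies is simple enough to work through by hand. The formula and the 50-byte per-chunk overhead are taken from the test itself; the worked numbers below just plug in the test's inputs.

def expected_size(name, num_elems, elem_size, num_chunks=1):
    per_chunk_overhead = 50  # same constant the test uses
    return num_chunks * (len(name) + per_chunk_overhead) + num_elems * elem_size

# ui16:   1 chunk  * (4 + 50) + 1024 * 3 =   54 +  3072 =  3126 bytes
# float2: 1 chunk  * (6 + 50) + 4000 * 4 =   56 + 16000 = 16056 bytes
# float1 with fbgemm (bfloat16, 500-element chunks):
#         8 chunks * (6 + 50) + 4000 * 2 =  448 +  8000 =  8448 bytes
print(expected_size("ui16", 1024, 3))
print(expected_size("float2", 4000, 4))
print(expected_size("float1", 4000, 2, num_chunks=8))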