Example #1
0
class Temperature:
    """A wrapper, non-instance class to do one-time initialization of get_system()"""
    system = get_system()

    @classmethod
    def _get_core_temps(cls):
        if cls.system.arch == Architecture.Xavier:
            # Because we don't have nvidia-smi on Xavier, we need to use sysfs to read out the temperature.
            # The type of each thermal zone is in /sys/devices/virtual/thermal/thermal_zone<N>/type.
            # To avoid spawning a process per zone just to check whether it is a GPU zone, we hardcode the GPU_therm node:
            #     AGX Xavier: thermal_zone1
            #     Xavier NX:  thermal_zone1
            # NOTE: this mapping may change across submission rounds or device models.
            try:
                out_text = run_command(
                    "cat /sys/devices/virtual/thermal/thermal_zone1/temp",
                    get_output=True,
                    tee=False)
                # The temperature is in units of milli degC, so scale the result:
                temps = [int(str_temp) / 1000 for str_temp in out_text]
            except Exception as e:
                print("Bad temp reading")
                raise e
        else:
            # Non-xavier branch
            try:
                out_text = run_command(
                    "nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader",
                    get_output=True,
                    tee=False)
                # Multi-GPU instances return a list of strings, one temperature per GPU core
                temps = [int(str_temp) for str_temp in out_text]
            except Exception as e:
                print("Bad temp reading")
                raise e
        return temps

    @classmethod
    def logged_temp_wait(cls, temp, timeout=None):
        """ Spinwait on GPU temperature with optional timeout. Returns last measured temperature

        For multi-GPU systems, we use mean temperature.
        """
        start_time = time.perf_counter()
        # Poll until the GPU cools to the target, honoring the optional timeout
        succ = False
        mean_temp = None
        while timeout is None or (time.perf_counter() - start_time) < timeout:
            temps = cls._get_core_temps()
            mean_temp = sum(temps) / len(temps)
            if mean_temp <= temp:
                print("GPU has finished cooling")
                succ = True
                break
            print(mean_temp)
            time.sleep(2)
        if not succ:
            print("GPU failed to fully cool")
        return mean_temp
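A minimal usage sketch for the class above (not part of the original snippet); the 50 degC threshold and 600 s timeout are illustrative values only:

# Hypothetical caller: spin until the mean GPU temperature drops to 50 degC,
# giving up after 10 minutes.
final_temp = Temperature.logged_temp_wait(50, timeout=600)
print("Last measured mean temperature: {} degC".format(final_temp))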
Example #2
0
    def __init__(self, args):
        workspace_size = dict_get(args, "workspace_size", default=(5 << 30))
        logging.info("Using workspace size: {:,}".format(workspace_size))
        super().__init__(args,
                         name=BENCHMARKS.BERT,
                         workspace_size=workspace_size)
        self.bert_config_path = "code/bert/tensorrt/bert_config.json"

        self.seq_len = 384  # default sequence length

        self.batch_size = dict_get(args, "batch_size", default=1)

        self.num_profiles = 1
        if 'gpu_inference_streams' in args:
            # use gpu_inference_streams to determine the number of duplicated profiles
            # in the engine when not using lwis mode
            self.num_profiles = args['gpu_inference_streams']

        self.is_int8 = args['precision'] == 'int8'

        if self.is_int8:
            self.model_path = dict_get(
                args,
                "model_path",
                default="build/models/bert/bert_large_v1_1_fake_quant.onnx")
        else:
            self.model_path = dict_get(
                args,
                "model_path",
                default="build/models/bert/bert_large_v1_1.onnx")

        self.bert_config = BertConfig(self.bert_config_path)

        self.enable_interleaved = False
        if self.is_int8 and 'enable_interleaved' in args:
            self.enable_interleaved = args['enable_interleaved']

        # Small-Tile GEMM Plugin
        # Since it doesn't support the interleaved format, the two options are mutually exclusive
        self.use_small_tile_gemm_plugin = self.args.get(
            "use_small_tile_gemm_plugin", False)
        self.gemm_plugin_fairshare_cache_size = self.args.get(
            "gemm_plugin_fairshare_cache_size", -1)
        if self.enable_interleaved and self.use_small_tile_gemm_plugin:
            assert False, "Small-Tile GEMM Plugin doesn't support interleaved format."

        # Query system id for architecture
        self.system = get_system()
        self.gpu_arch = self.system.arch

        if self.batch_size > 512:
            # tactics selection is limited at very large batch sizes
            self.builder_config.max_workspace_size = 7 << 30
        if 'nx' in self.system.gpu.lower():
            # Use only 1 GB of workspace on Xavier NX
            self.builder_config.max_workspace_size = 1 << 30
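The snippet above leans on the dict_get helper throughout; a plausible reading is sketched below, assuming it simply falls back to the default when the key is absent or maps to None (the real helper in code.common may differ):

def dict_get(d, key, default=None):
    """Sketch of the assumed dict_get behavior: return d[key] unless the key
    is missing or explicitly None, in which case return the default."""
    val = d.get(key)
    return default if val is None else val

# e.g. dict_get({"batch_size": 8}, "workspace_size", default=(5 << 30)) -> 5368709120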
Example #3
0
    def initialize(self):
        """
        Parse input ONNX file to a TRT network. Apply layer optimizations and fusion plugins on network.
        """

        # Query system id for architecture
        self.system = get_system()
        self.gpu_arch = self.system.arch

        # Create network.
        self.network = self.builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

        # Parse from onnx file.
        parser = trt.OnnxParser(self.network, self.logger)

        rn50_gs = RN50GraphSurgeon(self.model_path, self.gpu_arch,
                                   self.device_type, self.precision,
                                   self.cache_file, self.need_calibration)
        model = rn50_gs.process_onnx()
        success = parser.parse(onnx._serialize(model))
        if not success:
            raise RuntimeError(
                "ResNet50 onnx model processing failed! Error: {:}".format(
                    parser.get_error(0).desc()))
        # Unmark topk_layer_output_value, leaving only topk_layer_output_index
        assert self.network.num_outputs == 2, "Two outputs expected"
        assert self.network.get_output(0).name == "topk_layer_output_value",\
            "unexpected tensor: {}".format(self.network.get_output(0).name)
        assert self.network.get_output(1).name == "topk_layer_output_index",\
            "unexpected tensor: {}".format(self.network.get_output(1).name)
        logging.info("Unmarking output: {:}".format(
            self.network.get_output(0).name))
        self.network.unmark_output(self.network.get_output(0))

        # Set input dtype and format
        input_tensor = self.network.get_input(0)
        if self.input_dtype == "int8":
            input_tensor.dtype = trt.int8
            input_tensor.dynamic_range = (-128, 127)
        if self.input_format == "linear":
            input_tensor.allowed_formats = 1 << int(trt.TensorFormat.LINEAR)
        elif self.input_format == "chw4":
            input_tensor.allowed_formats = 1 << int(trt.TensorFormat.CHW4)

        self.initialized = True
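The 1 << int(...) pattern above builds a single-bit mask; as a sketch (assuming the same trt import and an already-parsed network), several formats can be allowed on one tensor by OR-ing the bits:

# Allow either LINEAR or CHW4 on the input tensor by combining format bits.
input_tensor = self.network.get_input(0)
input_tensor.allowed_formats = (1 << int(trt.TensorFormat.LINEAR)) | \
                               (1 << int(trt.TensorFormat.CHW4))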
Example #4
0
    def logged_temp_wait(cls, temp, timeout=None):
        """ Spinwait on GPU temperature with optional timeout. Returns last measured temperature

        For multi-GPU systems, we use mean temperature.
        """
        if cls.system is None:
            cls.system = get_system()
        start_time = time.perf_counter()
        # Poll until the GPU cools to the target, honoring the optional timeout
        succ = False
        mean_temp = None
        while timeout is None or (time.perf_counter() - start_time) < timeout:
            temps = cls._get_core_temps()
            mean_temp = sum(temps) / len(temps)
            if mean_temp <= temp:
                print("GPU has finished cooling")
                succ = True
                break
            print(mean_temp)
            time.sleep(2)
        if not succ:
            print("GPU failed to fully cool")
        return mean_temp
Example #5
0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
sys.path.insert(0, os.getcwd())

import argparse

from code.common import get_system

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--gpu_architecture",
        help="Print GPU architecture class instead of system id",
        action="store_true"
    )
    args = parser.parse_args()

    system = get_system()

    if args.gpu_architecture:
        print(system.arch.name)
    else:
        print(system.get_id())
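A hedged usage sketch for the script above; the path scripts/get_system_id.py is illustrative, not taken from the original:

import subprocess

# With --gpu_architecture the script prints the Architecture name;
# without it, it prints the system id string.
arch = subprocess.check_output(
    ["python3", "scripts/get_system_id.py", "--gpu_architecture"],
    text=True).strip()
print("Detected architecture:", arch)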
Example #6
0
    def initialize(self):
        """
        Parse input ONNX file to a TRT network. Apply layer optimizations and fusion plugins on network.
        """

        # Query system id for architecture
        self.system = get_system()
        self.gpu_arch = self.system.arch

        # Create network.
        self.network = self.builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

        # Parse from onnx file.
        parser = trt.OnnxParser(self.network, self.logger)

        with open(self.model_path, "rb") as f:
            model = f.read()
        success = parser.parse(model)
        if not success:
            raise RuntimeError(
                "ofa_autosinian onnx model processing failed! Error: {:}".format(
                    parser.get_error(0).desc()))
        # Set input dtype and format
        input_tensor = self.network.get_input(0)
        if self.input_dtype == "int8":
            input_tensor.dtype = trt.int8
            scale = struct.unpack('!f', bytes.fromhex('3caa5293'))[0]
            input_tensor.dynamic_range = (-scale * 127.0, scale * 127.0)
        if self.input_format == "linear":
            input_tensor.allowed_formats = 1 << int(trt.TensorFormat.LINEAR)
        elif self.input_format == "chw4":
            input_tensor.allowed_formats = 1 << int(trt.TensorFormat.CHW4)

        # Get the layers we care about.
        nb_layers = self.network.num_layers

        while self.network.num_outputs > 0:
            logging.info("Unmarking output: {:}".format(
                self.network.get_output(0).name))
            self.network.unmark_output(self.network.get_output(0))
        # Add top-k layer
        last_fc_layer = self.network.get_layer(nb_layers - 1)
        topk_layer = self.network.add_topk(last_fc_layer.get_output(0),
                                           trt.TopKOperation.MAX, 1, 2)
        topk_layer.name = "topk_layer"
        topk_layer.get_output(0).name = "topk_layer_output_value"
        topk_layer.get_output(1).name = "topk_layer_output_index"
        self.network.mark_output(topk_layer.get_output(1))

        if self.network.num_outputs != 1:
            logging.warning(
                "num outputs should be 1 after unmarking! Has {:}".format(
                    self.network.num_outputs))
            raise RuntimeError("Unexpected number of network outputs after unmarking")

        if self.precision == "int8" and self.batch_size > 1 and (
                not self.need_calibration):
            self.autosinian_optimize()

        self.initialized = True
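The int8 dynamic range above comes from a float32 scale stored as a big-endian hex string; a quick check of the decoding (values are approximate):

import struct

scale = struct.unpack('!f', bytes.fromhex('3caa5293'))[0]
print(scale)                             # ~0.0208
print((-scale * 127.0, scale * 127.0))   # roughly (-2.64, 2.64)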
Example #7
0
    def __init__(self, bench, scen, config_dict, config_funcs=None):
        """ Construct a ConfigGrid

        Args:
            bench (str): The benchmark requested (fuzzy match behavior using BENCHMARKS.alias)
            scen (str): The scenario requested (fuzzy match behavior using SCENARIOS.alias)
            config_dict (Dict[str, List]): A config dictionary. Refer to 'Config Schema' in the README for format
            config_funcs (Dict[str, Callable]): A dictionary of META* functions. Refer to 'Config Schema' in the README for requirements.

        """
        if args.spoof_system_id:
            self.system_id = args.spoof_system_id
        else:
            self.system = get_system()
            self.system_id = self.system.get_id()
        self.benchmark = BENCHMARKS.alias(bench)
        self.scenario = SCENARIOS.alias(scen)
        candidate_configs = find_config_files(benchmarks=[self.benchmark],
                                              scenarios=[self.scenario])
        configs = load_configs(candidate_configs)
        assert len(configs) == 1
        # To work with "extends" and "scales", we need to call into another config helper:
        self.base_config = get_system_benchmark_config(configs[0], self.system_id)
        self.default_config = configs[0]['default']
        griddict = config_dict
        self.no_rebuild_params = None
        # No-op
        self.is_config_valid = lambda x: True
        # No-op
        self.search_callback = None
        self.replay = None
        funcs_processed = set()
        if config_funcs:
            if config_funcs.get("META_search_callback"):
                funcs_processed.add("META_search_callback")
                self.search_callback = config_funcs['META_search_callback']
                if not args.use_cached:
                    raise RuntimeError(f"META_search_callback must be used with --use_cached for reproducibility.")
            if config_funcs.get("META_get_no_rebuild_params"):
                funcs_processed.add("META_get_no_rebuild_params")
                norebuild_params = config_funcs.get("META_get_no_rebuild_params")()
                assert isinstance(norebuild_params, list)
                # Make sure these keys all exist in our grid params:
                # But we might not know grid params if a search_callback is being used:
                if self.search_callback is None:
                    missing_keys = set(norebuild_params) - set(griddict.keys())
                    if len(missing_keys) > 0:
                        raise RuntimeError(f"The keys: {missing_keys} were mentioned in META_get_no_rebuild_params, but are not a specified parameter in:\n{griddict.keys()}")
                else:
                    print("WARNING: Not checking get_no_rebuild_params against grid parameters, be careful")
                # For use later, we're gonna turn this into a set:
                self.no_rebuild_params = set(norebuild_params)
            if config_funcs.get("META_is_config_valid"):
                funcs_processed.add("META_is_config_valid")
                self.is_config_valid = config_funcs["META_is_config_valid"]

                # Make sure we aren't scanning other params:
            # Other META handling goes here
            unmatched_funcs = set(config_funcs.keys()) - funcs_processed
            if len(unmatched_funcs) > 0:
                raise RuntimeError(f"Found the following META functions which haven't been implemented, refer to README for proper naming {unmatched_funcs}")

        # Make sure all keys we intend to set exist in our config:
        if not args.no_check_keys:
            for grid_key in griddict.keys():
                if grid_key not in self.base_config:
                    print(f"{grid_key} not found in base config")
                    print(f"{self.base_config}")
                    assert False
        # Make sure all values are non-empty lists of something that isn't a list or a dict
        # TODO expand this to something reasonable to help META_bisect
        if "META_search_callback" in funcs_processed:
            print("WARNING: Skipping parameter validation because META_search_callback was provided")
        else:
            for val in griddict.values():
                assert isinstance(val, list)
                #assert len(val) >= 1
                assert all(not isinstance(el, list) and not isinstance(el, dict) for el in val)
        self.grid = griddict
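A sketch of how this constructor might be driven; the grid keys (gpu_batch_size, gpu_copy_streams) and the validity rule are hypothetical, chosen only to illustrate the expected shapes of config_dict and config_funcs:

# Every grid value is a flat list of scalars, as the checks above require.
grid = {
    "gpu_batch_size": [1024, 2048],
    "gpu_copy_streams": [1, 2],
}

funcs = {
    # Hypothetical: changing copy streams alone should not force an engine rebuild.
    "META_get_no_rebuild_params": lambda: ["gpu_copy_streams"],
    # Hypothetical validity rule over a single candidate config.
    "META_is_config_valid": lambda cfg: cfg["gpu_batch_size"] >= cfg["gpu_copy_streams"],
}

cg = ConfigGrid("resnet50", "Offline", grid, config_funcs=funcs)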
Example #8
0
    def __init__(self, bench, scen, config_dict, config_funcs=None):
        """ Construct a ConfigGrid

        Args:
            bench (str): The benchmark requested (fuzzy match behavior using BENCHMARKS.alias)
            scen (str): The scenario requested (fuzzy match behavior using SCENARIOS.alias)
            config_dict (Dict[str, List]): A config dictionary. Refer to 'Config Schema' in the README for format
            config_funcs (Dict[str, Callable]): A dictionary of META* functions. Refer to 'Config Schema' in the README for requirements.

        """
        self.system = get_system()
        self.system_id = self.system.get_id()
        self.benchmark = BENCHMARKS.alias(bench)
        self.scenario = SCENARIOS.alias(scen)
        candidate_configs = find_config_files(benchmarks=[self.benchmark],
                                              scenarios=[self.scenario])
        configs = load_configs(candidate_configs)
        assert len(configs) == 1
        # To work with "extends" and "scales", we need to call into another config helper:
        self.base_config = get_system_benchmark_config(configs[0],
                                                       self.system_id)
        self.default_config = configs[0]['default']
        griddict = config_dict
        self.no_rebuild_params = None
        # No-op
        self.is_config_valid = lambda x: True
        funcs_processed = set()
        if config_funcs:
            if config_funcs.get("META_get_no_rebuild_params"):
                funcs_processed.add("META_get_no_rebuild_params")
                norebuild_params = config_funcs.get(
                    "META_get_no_rebuild_params")()
                assert isinstance(norebuild_params, list)
                # Make sure these keys all exist in our grid params:
                missing_keys = set(norebuild_params) - set(griddict.keys())
                if len(missing_keys) > 0:
                    raise RuntimeError(
                        f"The keys: {missing_keys} were mentioned in META_get_no_rebuild_params, but are not a specified parameter in:\n{griddict.keys()}"
                    )
                # For use later, we're gonna turn this into a set:
                self.no_rebuild_params = set(norebuild_params)
            if config_funcs.get("META_is_config_valid"):
                funcs_processed.add("META_is_config_valid")
                self.is_config_valid = config_funcs["META_is_config_valid"]
            # Other META handling goes here
            unmatched_funcs = set(config_funcs.keys()) - funcs_processed
            if len(unmatched_funcs) > 0:
                raise RuntimeError(
                    f"Found the following META functions which haven't been implemented, refer to README for proper naming {unmatched_funcs}"
                )

        # Make sure all keys we intend to set exist in our config:
        for grid_key in griddict.keys():
            if grid_key not in self.base_config:
                print(f"{grid_key} not found in base config")
                print(f"{self.base_config}")
                assert False
        # Make sure all values are non-empty lists of something that isn't a list or a dict
        for val in griddict.values():
            assert isinstance(val, list)
            assert len(val) >= 1
            assert all(not isinstance(el, list) and not isinstance(el, dict)
                       for el in val)

        self.grid = griddict
Example #9
0
    def initialize(self):
        """Create DLRM network using TRT API and plugins and set the weights."""

        useConvForFC_bottom = (self.precision == "int8")
        useConvForFC_top = (self.precision == "int8")
        interactionsOutputInterleaved = (not self.need_calibration) and (self.input_dtype == "int8")

        # Turn off interleaved format if top_mlp use non-interleaved format
        if not self.enable_interleaved_top_mlp:
            interactionsOutputInterleaved = False
        else:
            print("Using batch-interleaved format for top_mlp.")

        # If not already done, split the model into a binary file of quantized embedding weights and a model without embeddings
        if not (os.path.isfile(self.embedding_weights_binary_filepath) and os.path.isfile(self.model_without_embedding_weights_filepath)):
            logging.info("Loading checkpoint from " + self.model_filepath)
            self.weights = torch.load(self.model_filepath, map_location="cpu")["state_dict"]
            self.dump_embedding_weights_to_binary_file()
            logging.info("Writing model without embedding weights to " + self.model_without_embedding_weights_filepath)
            torch.save(self.weights, self.model_without_embedding_weights_filepath)
            del self.weights

        # Dump row frequencies to file in binary format
        if self.use_row_frequencies and not os.path.isfile(self.row_frequencies_binary_filepath):
            logging.info("Writing row frequencies to " + self.row_frequencies_binary_filepath)
            self.dump_row_frequencies_to_binary_file()

        # Load weights
        self.weights = torch.load(self.model_without_embedding_weights_filepath, map_location="cpu")

        # Create network.
        self.network = self.builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

        # Numerical input
        numerical_input = self.network.add_input("numerical_input", trt.DataType.FLOAT, (-1, self.num_numerical_inputs, 1, 1))
        if not self.need_calibration:
            if self.input_dtype == "int8":
                numerical_input.dtype = trt.int8
            elif self.input_dtype == "fp16":
                numerical_input.dtype = trt.float16
            if self.input_format == "linear":
                numerical_input.allowed_formats = 1 << int(trt.TensorFormat.LINEAR)
            elif self.input_format == "chw4":
                numerical_input.allowed_formats = 1 << int(trt.TensorFormat.CHW4)
            elif self.input_format == "chw32":
                numerical_input.allowed_formats = 1 << int(trt.TensorFormat.CHW32)

        # Bottom MLP
        if self.need_calibration or self.input_dtype != "int8":
            bottom_mlp = self.add_mlp(numerical_input, self.num_numerical_inputs, self.bottom_mlp_channels, self.bottom_mlp_names,
                                      last_relu=True, useConvForFC=useConvForFC_bottom)
        else:
            bottom_mlp_plugin, output_tensor_name = self.add_fused_bottom_mlp("DLRM_BOTTOM_MLP_TRT", numerical_input, self.num_numerical_inputs, self.bottom_mlp_channels, self.bottom_mlp_names)
            bottom_mlp = self.network.add_plugin_v2([numerical_input], bottom_mlp_plugin)
            bottom_mlp.get_output(0).name = output_tensor_name
        bottom_mlp_shuffle = self.network.add_shuffle(bottom_mlp.get_output(0))
        bottom_mlp_shuffle.reshape_dims = trt.Dims((-1, 1, self.embedding_size))

        # Index input
        index_input = self.network.add_input("index_input", trt.DataType.INT32, (-1, self.num_features))

        # Embedding lookup and interactions
        dlrm_interactions_plugin = self.get_dlrm_interactions_plugin("DLRM_INTERACTIONS_TRT", np.cumsum(
            np.array([0] + self.embedding_rows[:-1]).astype(np.int32)).astype(np.int32), interactionsOutputInterleaved)
        interaction_output_concat = self.network.add_plugin_v2([bottom_mlp.get_output(0), index_input], dlrm_interactions_plugin)
        interaction_output_concat.name = "interaction_plugin"
        interaction_output_concat.get_output(0).name = "interaction_output_concat_output"

        if self.enable_interleaved_top_mlp and not interactionsOutputInterleaved:
            # Shuffle from [BS, C, 1, 1] to [BS//2, C, 2, 1] before top_mlp
            interleave_pre_top_mlp = self.network.add_shuffle(interaction_output_concat.get_output(0))
            interleave_pre_top_mlp.reshape_dims = trt.Dims((-1, 2, interaction_output_concat.get_output(0).shape[1], 0))
            interleave_pre_top_mlp.second_transpose = trt.Permutation([0, 2, 1, 3])
            interleave_pre_top_mlp.name = "interleave_pre_top_mlp"

            top_mlp_input = interleave_pre_top_mlp.get_output(0)
            top_mlp_input.name = "interleave_pre_top_mlp"
        else:
            top_mlp_input = interaction_output_concat.get_output(0)

        # Insert the Small-Tile GEMM plugin. The plugin only supports Ampere.
        gpu_arch = get_system().arch
        system_id = get_system().gpu
        if self.use_small_tile_gemm_plugin:
            if gpu_arch != Architecture.Ampere:
                print("Small-Tile GEMM plugin does not support {}. Plugin disabled.".format(system_id))
                self.use_small_tile_gemm_plugin = False

        # Enabling the GEMM plugin with the interleaved format is not recommended.
        # Note (2/7/21): GEMM plugin doesn't perform well when H*W > 1
        if self.use_small_tile_gemm_plugin and self.enable_interleaved_top_mlp:
            print("Warning: small-Tile GEMM plugin performance will be "
                  "significantly impacted by interleaved format. Turn off "
                  "interleaved format for the best performance")

        tmp_mlp_input = top_mlp_input
        tmp_input_size = self.top_mlp_input_size

        # Helper function to check whether the provided (C, K) shape is supported
        # by the Small-Tile GEMM plugin
        def support_small_tile_gemm_func(C, K):
            return (C >= 256) and (C <= 1280) and (C % 128 == 0) and (K % 128 == 0)

        # Split the top_mlp layers, and use GEMM plugin for 2,4,6
        # C, K for top_mlp.0,2,4,6,8: [480,1024],[1024,1024],[1024,512],[512,256],[256,1]
        for i in range(len(self.top_mlp_channels)):
            # Insert plugin if the layer meets the restriction
            if support_small_tile_gemm_func(tmp_input_size, self.top_mlp_channels[i]) and \
                    self.use_small_tile_gemm_plugin:
                print("Replacing {} with Small-Tile GEMM Plugin, with fairshare cache size {}".
                      format(self.top_mlp_names[i], self.gemm_plugin_fairshare_cache_size))
                layer_top_mlp = self.add_small_tile_gemm_top_mlp(
                    tmp_mlp_input, tmp_input_size,
                    self.top_mlp_channels[i], self.top_mlp_names[i],
                    self.gemm_plugin_fairshare_cache_size
                )
            else:
                layer_top_mlp = self.add_single_mlp(
                    tmp_mlp_input, tmp_input_size,
                    self.top_mlp_channels[i], self.top_mlp_names[i],
                    useConvForFC=useConvForFC_top,
                    add_relu=(i != len(self.top_mlp_channels) - 1))

            tmp_mlp_input = layer_top_mlp.get_output(0)
            tmp_input_size = self.top_mlp_channels[i]

        top_mlp = layer_top_mlp

        if self.enable_interleaved_top_mlp:
            # Shuffle [BS//2, 1, 2, 1] back to [BS, 1, 1, 1]
            interleave_post_top_mlp = self.network.add_shuffle(top_mlp.get_output(0))
            interleave_post_top_mlp.reshape_dims = trt.Dims((-1, 0, 1, 0))
            interleave_post_top_mlp.name = "interleave_post_top_mlp"

            sigmoid_input = interleave_post_top_mlp.get_output(0)
            sigmoid_input.name = "interleave_post_top_mlp"
        else:
            sigmoid_input = top_mlp.get_output(0)

        # Sigmoid
        sigmoid_layer = self.network.add_activation(sigmoid_input, trt.ActivationType.SIGMOID)
        sigmoid_layer.name = "sigmoid"
        sigmoid_layer.get_output(0).name = "sigmoid_output"

        # Output
        self.network.mark_output(sigmoid_layer.get_output(0))

        # Make sure we release the memory to system
        del self.weights

        self.initialized = True
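Checking the helper above against the (C, K) pairs listed in the comment confirms why only top_mlp layers 2, 4 and 6 get the plugin: 480 and 1 are not multiples of 128, so the first and last layers fall back to the plain MLP path.

def support_small_tile_gemm_func(C, K):
    return (C >= 256) and (C <= 1280) and (C % 128 == 0) and (K % 128 == 0)

# (C, K) for top_mlp.0, 2, 4, 6, 8 as listed in the comment above.
shapes = [(480, 1024), (1024, 1024), (1024, 512), (512, 256), (256, 1)]
print([support_small_tile_gemm_func(c, k) for c, k in shapes])
# -> [False, True, True, True, False]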