Code example #1
def lower_modules_to_accelerator(model: nn.Module,
                                 trace,
                                 export_options: ExportConfig,
                                 throughput_optimize=False):
    # Raise error if accelerator could not be imported
    if not accelerator_lowering_supported:
        raise RuntimeError("Accelerator Lowering not supported!")

    import torch_glow

    log_accelerator_feature_usage("build.NNPI")
    if ((hasattr(model, "encoder")
         and isinstance(model.encoder, RoBERTaEncoder))
            or (hasattr(model, "representation")
                and isinstance(model.representation, AcceleratorBiLSTM)) or
        (hasattr(model, "lower_module")
         # Internal CNN LM module to add accelerator support.
         and type(model.lower_module).__qualname__ == "CNNLowerModule")):
        backend = "NNPI"
        (
            submod_modelpath,
            compilation_spec_dict,
            inputs_function,
        ) = accelerator.get_modules(model, backend)[0]
        submod_tracepath = accelerator.model2trace_path(submod_modelpath)
        spec = torch_glow.CompilationSpec()
        spec.get_settings().set_glow_backend(backend)
        compilation_group = torch_glow.CompilationGroup()
        spec.compilation_groups_append(compilation_group)
        compilation_group_settings = compilation_group.get_settings()
        compilation_group_settings.set_convert_to_fp16(True)

        # Override the options for throughput-optimized case
        if throughput_optimize:
            compilation_spec_dict["NNPI_IceCores"] = "4"
            compilation_spec_dict["NNPINumParallelChunks"] = "4"
            compilation_group_settings.set_replication_count(3)

        for k, v in compilation_spec_dict.items():
            compilation_group.get_settings().backend_specific_opts_insert(k, v)

        if inputs_function is not None:
            input_sets = inputs_function(model, trace, export_options, None,
                                         submod_modelpath)
        else:
            raise RuntimeError(
                "inputs_function needs to be specified in accelerator decorator"
            )
        compilation_group.set_input_sets(input_sets)

        trace = torch_glow.to_glow_selective(
            trace,
            {submod_tracepath: spec},
            inplace=False,
        )

        return trace
    else:
        return trace
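
This variant is normally invoked from torchscript_export (see code examples #4-#6 below). A minimal usage sketch, assuming the model exposes trace() and that ExportConfig is default-constructible as in those examples; all names here are illustrative:

# Hypothetical call site; `model`, `inputs`, and the config values are illustrative.
export_config = ExportConfig()
scripted_trace = model.trace(inputs)  # TorchScript trace produced earlier in the export flow
lowered_trace = lower_modules_to_accelerator(
    model,
    scripted_trace,
    export_config,
    throughput_optimize=True,  # sets NNPI_IceCores/NNPINumParallelChunks to "4" and replication count to 3
)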
Code example #2
File: quantize.py  Project: raybrshen/pytext
def quantize_statically(model,
                        inputs,
                        data_loader,
                        linear_only=False,
                        module_swap=False):
    log_feature_usage("export.quantize.statically")
    if (hasattr(model, "encoder") and isinstance(model.encoder, RoBERTaEncoder)
            and linear_only):
        log_accelerator_feature_usage("quantize.statically")
        qconfig = QConfig(
            activation=HistogramObserver.with_args(reduce_range=False),
            weight=default_weight_observer,
        )
        qconfig_dict = {"": None}
        if module_swap:
            layers = model.encoder.encoder.transformer.layers.layers
            layers_str = "encoder.encoder.transformer.layers.layers"
        else:
            layers = model.encoder.encoder.transformer.layers
            layers_str = "encoder.encoder.transformer.layers"

        # skip first layer
        for layer_idx in range(1, len(layers)):
            qconfig_dict[
                layers_str +
                ".{}.attention.input_projection".format(layer_idx)] = qconfig
            qconfig_dict[
                layers_str +
                ".{}.attention.output_projection".format(layer_idx)] = qconfig
            for mlp_idx, m in enumerate(layers[layer_idx].residual_mlp.mlp):
                # Only quantize the first linear, otherwise there are accuracy issues
                if type(m) == torch.nn.Linear and mlp_idx < 1:
                    qconfig_dict[layers_str + ".{}.residual_mlp.mlp.{}".format(
                        layer_idx, mlp_idx)] = qconfig
        trace = model.graph_mode_quantize(inputs,
                                          data_loader,
                                          qconfig_dict=qconfig_dict,
                                          force_quantize=True)
    else:
        trace = model.graph_mode_quantize(inputs, data_loader)

    return trace
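
A minimal usage sketch for quantize_statically, mirroring the call sites in the torchscript_export examples below; `task`, `model`, and `batch` are illustrative assumptions:

# Hypothetical calibration + static quantization (names are illustrative):
unused_raw_batch, batch = next(iter(task.data.batches(Stage.TRAIN, load_early=True)))
inputs = model.onnx_trace_input(batch)                           # example inputs for graph-mode quantization
data_loader = task.data.batches(Stage.TRAIN, load_early=False)   # calibration batches
trace = quantize_statically(model, inputs, data_loader,
                            linear_only=True,   # quantize only the RoBERTa linear projections (first layer skipped)
                            module_swap=True)   # layers live under ...transformer.layers.layers after module swapping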
Code example #3
def lower_modules_to_accelerator(model: nn.Module, trace,
                                 export_options: ExportConfig):
    import torch_glow

    log_accelerator_feature_usage("build.NNPI")
    if hasattr(model, "encoder") and isinstance(model.encoder, RoBERTaEncoder):
        backend = "NNPI"
        (
            submod_modelpath,
            compilation_spec_dict,
            inputs_function,
        ) = accelerator.get_modules(model, backend)[0]
        submod_tracepath = accelerator.model2trace_path(submod_modelpath)
        spec = torch_glow.CompilationSpec()
        spec.get_settings().set_glow_backend(backend)
        compilation_group = torch_glow.CompilationGroup()
        spec.compilation_groups_append(compilation_group)
        compilation_group_settings = compilation_group.get_settings()
        compilation_group_settings.set_convert_to_fp16(True)
        for k, v in compilation_spec_dict.items():
            compilation_group.get_settings().backend_specific_opts_insert(k, v)

        if inputs_function is not None:
            input_sets = inputs_function(model, trace, export_options, None,
                                         submod_modelpath)
        else:
            raise RuntimeError(
                "inputs_function needs to be specified in accelerator decorator"
            )
        compilation_group.set_input_sets(input_sets)

        trace = torch_glow.to_glow_selective(
            trace,
            {submod_tracepath: spec},
            inplace=False,
        )

        return trace
    else:
        return trace
Code example #4
File: new_task.py  Project: wwjiang007/pytext
    def torchscript_export(
        self,
        model,
        export_path=None,
        sort_input=False,
        sort_key=1,
        export_config=None,
    ):
        # TODO(T88310041) Remove These
        torch._C._jit_set_profiling_executor(True)
        torch._C._jit_set_profiling_mode(False)
        # unpack export config
        if export_config is None:
            export_config = ExportConfig()

        quantize = export_config.torchscript_quantize
        seq_padding_control = export_config.seq_padding_control
        batch_padding_control = export_config.batch_padding_control
        accel = AccelerateOptions(export_config.accelerate)
        print(f"Using accelerate options: {accel.__dict__}")

        # what hosts can this model run on
        # by default, pytext works on CPU and CUDA (because it implements set_device)
        model_host = ["cpu", "cuda"]
        if accel.use_cuda:
            # CUDA FP16 models only work on CUDA
            model_host = ["cuda"]
        if accel.use_nnpi:
            model_host = ["nnpi"]
        if hasattr(model, "set_host"):
            model.set_host(model_host)

        # what is the type of this model
        # pytext models are nlp models
        model_type = ["nlp"]

        instance_paths_p = any(
            True for _ in find_module_instances(model, RoBERTaEncoder, []))
        if instance_paths_p:
            model_type.append("transformer")

        instance_paths_p = any(
            True for _ in find_module_instances(model, BiLSTM, []))
        if instance_paths_p:
            model_type.append("BiLSTM")

        if hasattr(model, "set_model"):
            model.set_type(model_type)

        # Make sure to put the model on CPU and disable CUDA before exporting to
        # ONNX to disable any data_parallel pieces
        cuda.CUDA_ENABLED = False
        model.cpu()
        optimizer = self.trainer.optimizer
        optimizer.pre_export(model)

        model = rewrite_nnpi_modules(model, accel)

        # Trace needs eval mode, to disable dropout etc
        model.eval()
        model.prepare_for_onnx_export_()

        unused_raw_batch, batch = next(
            iter(self.data.batches(Stage.TRAIN, load_early=True)))
        inputs = model.onnx_trace_input(batch)
        # call model forward to set correct device types
        if sort_input:
            _, sorted_indices = sort(inputs[sort_key], descending=True)
            inputs = [i.index_select(0, sorted_indices) for i in inputs]
        model(*inputs)

        # Default to dynamic
        if accel.use_fx_quantize:
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            trace = quantize_fx(
                model,
                inputs,
                data_loader,
                accel.use_nnpi_fx_dynamic_quantize
                or accel.use_cpu_fx_dynamic_quantize,
                accel.use_nnpi_fx_static_selectively_quantize
                or accel.use_cpu_fx_static_selectively_quantize,
            )
        elif (quantize or accel.use_nnpi_quantize) and hasattr(
                model, "graph_mode_quantize"):
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            print("Quantizing the model ...")
            # recognize legacy nnpi_q or $platform:$option syntax
            quantize_linear_only = accel.use_nnpi_quantize
            module_swap = accel.use_nnpi
            trace = quantize_statically(model, inputs, data_loader,
                                        quantize_linear_only, module_swap)
        else:
            if quantize:
                log_feature_usage("quantize.dynamically.CPU")
                model.quantize()
            if accel.use_cuda and (accel.use_cuda_half_ft
                                   or accel.use_cuda_dq):
                log_accelerator_feature_usage(
                    "build.CUDA.half.faster_transformers")
                # We need a separate path for GPU-only tracing, as we can't just trace a CPU model
                # and invoke .cuda().half(),
                # as we don't have equivalent CPU implementations of these operators.
                precision.FP16_ENABLED = True
                cuda.CUDA_ENABLED = True
                if accel.use_cuda_dq:
                    model = swap_modules(model, MODULE_TO_REWRITER["cuda-dq"])
                else:
                    model = swap_modules(model, MODULE_TO_REWRITER["cuda"])
                model.eval()
                model.half().cuda()
                # obtain new inputs with cuda/fp16 enabled.
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True)))
                inputs = model.onnx_trace_input(batch)
                trace = model.trace(inputs)
                print("Traced (faster_transformers)!")
                # should be unnecessary.
                trace.cuda().half()
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True)))
                inputs = model.onnx_trace_input(batch)
                results = trace(*inputs)
                assert results
            else:
                trace = model.trace(inputs)
                print("Traced!")
                if accel.use_cuda and accel.use_cuda_half:
                    log_accelerator_feature_usage("build.CUDA.half")
                    # convert trace to half precision
                    trace.cuda().half()

                    #### trace test: demonstrate that it is usable
                    precision.FP16_ENABLED = True
                    cuda.CUDA_ENABLED = True
                    unused_raw_batch, batch = next(
                        iter(self.data.batches(Stage.TRAIN, load_early=True)))
                    inputs = model.onnx_trace_input(batch)
                    assert trace(*inputs)
                    #### end of trace test
        if hasattr(model, "torchscriptify"):
            trace = model.torchscriptify(self.data.tensorizers, trace)
        if hasattr(trace, "validate"):
            trace.validate(export_config)
        if seq_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("sequence_length",
                                          seq_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring seq_padding_control"
                )
        if batch_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("batch_length",
                                          batch_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring batch_padding_control"
                )
        trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
        if accel.use_nnpi and not accel.use_nnpi_split:
            print("Lowering using to_glow")
            trace = lower_modules_to_accelerator(
                model,
                trace,
                export_config,
                accel.use_nnpi_throughput_optimized,
                accel.use_nnpi_throughput_maximized,
                accel.use_nnpi_gelu_clip,
            )
        if accel.use_nnpi_split:
            print("Lowering split model to Glow")
            trace = lower_split_model_to_accelerator(model, trace,
                                                     export_config)

        if export_path is not None:
            print(f"Saving torchscript model to: {export_path}")
            with PathManager.open(export_path, "wb") as f:
                torch.jit.save(trace, f)
        return trace
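
Code example #4 probes for optional set_host and set_type hooks before export. A minimal sketch of a model exposing them, under the assumption that the hooks only record metadata (the real PyText models may do more):

# Hypothetical model exposing the optional hooks probed by torchscript_export above.
class HostAwareModel(nn.Module):
    def set_host(self, model_host):
        self.model_host = model_host  # e.g. ["cpu", "cuda"], ["cuda"], or ["nnpi"]

    def set_type(self, model_type):
        self.model_type = model_type  # e.g. ["nlp", "transformer", "BiLSTM"]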
Code example #5
File: new_task.py  Project: prabhat00155/pytext
    def torchscript_export(
        self,
        model,
        export_path=None,
        sort_input=False,
        sort_key=1,
        export_config=None,
    ):
        # unpack export config
        if export_config is None:
            export_config = ExportConfig()

        quantize = export_config.torchscript_quantize

        accelerate = export_config.accelerate
        seq_padding_control = export_config.seq_padding_control
        batch_padding_control = export_config.batch_padding_control
        inference_interface = export_config.inference_interface

        # Make sure to put the model on CPU and disable CUDA before exporting to
        # ONNX to disable any data_parallel pieces
        cuda.CUDA_ENABLED = False
        model.cpu()
        optimizer = self.trainer.optimizer
        optimizer.pre_export(model)

        if "nnpi" in accelerate:
            model = swap_modules_for_accelerator(model)

        # Trace needs eval mode, to disable dropout etc
        model.eval()
        model.prepare_for_onnx_export_()

        unused_raw_batch, batch = next(
            iter(self.data.batches(Stage.TRAIN, load_early=True))
        )
        inputs = model.onnx_trace_input(batch)
        # call model forward to set correct device types
        if sort_input:
            _, sorted_indices = sort(inputs[sort_key], descending=True)
            inputs = [i.index_select(0, sorted_indices) for i in inputs]
        model(*inputs)

        use_cuda_half = "cuda:half" in accelerate

        if quantize and hasattr(model, "graph_mode_quantize"):
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            print("Quantizing the model ...")
            quantize_linear_only = "nnpi_quantize" in accelerate
            module_swap = "nnpi" in accelerate
            trace = quantize_statically(
                model, inputs, data_loader, quantize_linear_only, module_swap
            )
        else:
            if quantize:
                log_accelerator_feature_usage("quantize.dynamically.CPU")
                model.quantize()

            trace = model.trace(inputs)
            print("traced!")

            if use_cuda_half:
                log_accelerator_feature_usage("build.CUDA.half")
                # convert trace to half precision
                trace.cuda().half()

                #### trace test: demonstrate that it is usable
                precision.FP16_ENABLED = True
                cuda.CUDA_ENABLED = True
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True))
                )
                inputs = model.onnx_trace_input(batch)
                assert trace(*inputs)
                #### end of trace test

        if hasattr(model, "torchscriptify"):
            trace = model.torchscriptify(self.data.tensorizers, trace)
        if hasattr(trace, "validate"):
            trace.validate(export_config)
        if seq_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("sequence_length", seq_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring seq_padding_control"
                )
        if batch_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("batch_length", batch_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring batch_padding_control"
                )
        if inference_interface is not None:
            if hasattr(trace, "inference_interface"):
                trace.inference_interface(inference_interface)
            else:
                print(
                    "inference_interface not supported by model. Ignoring inference_interface"
                )
        trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
        if "nnpi" in accelerate:
            print("lowering using to_glow")
            trace = lower_modules_to_accelerator(model, trace, export_config)

        if export_path is not None:
            print(f"Saving torchscript model to: {export_path}")
            with PathManager.open(export_path, "wb") as f:
                torch.jit.save(trace, f)
        return trace
Code example #6
    def torchscript_export(
        self,
        model,
        export_path=None,
        sort_input=False,
        sort_key=1,
        export_config=None,
    ):
        # unpack export config
        if export_config is None:
            export_config = ExportConfig()

        quantize = export_config.torchscript_quantize

        accelerate = export_config.accelerate
        seq_padding_control = export_config.seq_padding_control
        batch_padding_control = export_config.batch_padding_control

        # introduce a single nnpi:quantize that obviates need for torchscript quantize on NNPI
        use_nnpi = ("nnpi" in accelerate) or ("nnpi:quantize" in accelerate)
        use_nnpi_throughput_optimized = "nnpi:throughput_optimized" in accelerate
        use_cuda_half = "cuda:half" in accelerate
        use_cuda_half_faster_transformers = "cuda:half:ft" in accelerate

        use_nnpi_quantize = "nnpi:quantize" in accelerate
        use_nnpi_fx_static_quantize = "nnpi:fx_static_quantize" in accelerate
        use_nnpi_fx_dynamic_quantize = "nnpi:fx_dynamic_quantize" in accelerate
        use_cpu_fx_static_quantize = "cpu:fx_static_quantize" in accelerate
        use_cpu_fx_dynamic_quantize = "cpu:fx_dynamic_quantize" in accelerate
        use_fx_quantize = (use_nnpi_fx_static_quantize
                           or use_nnpi_fx_dynamic_quantize
                           or use_cpu_fx_static_quantize
                           or use_cpu_fx_dynamic_quantize)

        # Make sure to put the model on CPU and disable CUDA before exporting to
        # ONNX to disable any data_parallel pieces
        cuda.CUDA_ENABLED = False
        model.cpu()
        optimizer = self.trainer.optimizer
        optimizer.pre_export(model)

        if use_nnpi or use_fx_quantize:
            model = swap_modules_for_accelerator(model)

        # Trace needs eval mode, to disable dropout etc
        model.eval()
        model.prepare_for_onnx_export_()

        unused_raw_batch, batch = next(
            iter(self.data.batches(Stage.TRAIN, load_early=True)))
        inputs = model.onnx_trace_input(batch)
        # call model forward to set correct device types
        if sort_input:
            _, sorted_indices = sort(inputs[sort_key], descending=True)
            inputs = [i.index_select(0, sorted_indices) for i in inputs]
        model(*inputs)

        # Default to dynamic
        if use_fx_quantize:
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            trace = quantize_fx(
                model,
                inputs,
                data_loader,
                use_nnpi_fx_dynamic_quantize or use_cpu_fx_dynamic_quantize,
            )
        elif (quantize or use_nnpi_quantize) and hasattr(
                model, "graph_mode_quantize"):
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            print("Quantizing the model ...")
            # recognize legacy nnpi_q or $platform:$option syntax
            quantize_linear_only = use_nnpi_quantize or ("nnpi_quantize"
                                                         in accelerate)
            module_swap = use_nnpi
            trace = quantize_statically(model, inputs, data_loader,
                                        quantize_linear_only, module_swap)
        else:
            if quantize:
                log_feature_usage("quantize.dynamically.CPU")
                model.quantize()
            if use_cuda_half_faster_transformers:
                log_accelerator_feature_usage(
                    "build.CUDA.half.faster_transformers")
                # We need a separate path for GPU-only tracing, as we can't just trace a CPU model
                # and invoke .cuda().half(),
                # as we don't have equivalent CPU implementations of these operators.
                precision.FP16_ENABLED = True
                cuda.CUDA_ENABLED = True

                model = swap_modules_for_faster_transformer(model)
                model.eval()
                model.half().cuda()
                # obtain new inputs with cuda/fp16 enabled.
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True)))
                inputs = model.onnx_trace_input(batch)
                trace = model.trace(inputs)
                print("traced (faster_transformers)!")
                # should be unnecessary.
                trace.cuda().half()
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True)))
                inputs = model.onnx_trace_input(batch)
                assert trace(*inputs)
            else:
                trace = model.trace(inputs)
                print("traced!")
                if use_cuda_half:
                    log_accelerator_feature_usage("build.CUDA.half")
                    # convert trace to half precision
                    trace.cuda().half()

                    #### trace test: demonstrate that it is usable
                    precision.FP16_ENABLED = True
                    cuda.CUDA_ENABLED = True
                    unused_raw_batch, batch = next(
                        iter(self.data.batches(Stage.TRAIN, load_early=True)))
                    inputs = model.onnx_trace_input(batch)
                    assert trace(*inputs)
                    #### end of trace test
        if hasattr(model, "torchscriptify"):
            trace = model.torchscriptify(self.data.tensorizers, trace)
        if hasattr(trace, "validate"):
            trace.validate(export_config)
        if seq_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("sequence_length",
                                          seq_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring seq_padding_control"
                )
        if batch_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("batch_length",
                                          batch_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring batch_padding_control"
                )
        trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
        if use_nnpi:
            print("lowering using to_glow")
            trace = lower_modules_to_accelerator(
                model, trace, export_config, use_nnpi_throughput_optimized)

        if export_path is not None:
            print(f"Saving torchscript model to: {export_path}")
            with PathManager.open(export_path, "wb") as f:
                torch.jit.save(trace, f)
        return trace
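
The accelerate tokens parsed above can be combined in the export config. A hedged sketch of driving this export path; keyword construction of ExportConfig and the padding values are assumptions, only the attribute names are confirmed by the code above:

# Illustrative export invocation (ExportConfig kwargs and concrete values are assumptions):
export_config = ExportConfig(
    torchscript_quantize=False,
    accelerate=["nnpi:quantize", "nnpi:throughput_optimized"],  # static-quantize linears, lower to Glow/NNPI
    seq_padding_control=[0, 32, 256],  # example sequence-length buckets
    batch_padding_control=[0, 1, 8],   # example batch-size buckets
)
trace = task.torchscript_export(model, export_path="/tmp/model.pt1", export_config=export_config)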
Code example #7
def lower_modules_to_accelerator(
    model: nn.Module, trace, export_options: ExportConfig, throughput_optimize=False
):
    # Raise error if accelerator could not be imported
    if not accelerator_lowering_supported:
        raise RuntimeError("Accelerator Lowering not supported!")

    import torch_glow

    log_accelerator_feature_usage("build.NNPI")
    if (
        (hasattr(model, "encoder") and isinstance(model.encoder, RoBERTaEncoder))
        or (
            hasattr(model, "representation")
            and isinstance(model.representation, AcceleratorBiLSTM)
        )
        or (
            hasattr(model, "lower_module")
            # Internal CNN LM module to add accelerator support.
            and type(model.lower_module).__qualname__ == "CNNLowerModule"
        )
    ):
        backend = "NNPI"
        backend_qualifier = ""

        if throughput_optimize:
            backend_qualifier = ":throughput_optimized"

        modules_to_lower = accelerator.get_modules(model, backend + backend_qualifier)

        if len(modules_to_lower) < 1:
            raise RuntimeError("Need at least one module to lower to accelerator")
        elif len(modules_to_lower) > 1:
            print(f"Warning. Received {len(modules_to_lower)} modules to lower.")
            print("Warning. Only lowering first module.")

        (
            submod_modelpath,
            compilation_spec_dict,
            inputs_function,
        ) = modules_to_lower[0]
        submod_tracepath = accelerator.model2trace_path(submod_modelpath)
        spec = torch_glow.CompilationSpec()
        spec.get_settings().set_glow_backend(backend)
        compilation_group = torch_glow.CompilationGroup()
        spec.compilation_groups_append(compilation_group)
        compilation_group_settings = compilation_group.get_settings()

        # Set values from dict that are not set via backend-specific opts
        compilation_group_settings.set_convert_to_fp16(
            compilation_spec_dict.pop("glow:ConvertToFP16", "true") in ["true", "True"]
        )
        compilation_group_settings.set_replication_count(
            int(compilation_spec_dict.pop("glow:ReplicationCount", "1"))
        )

        for k, v in compilation_spec_dict.items():
            compilation_group.get_settings().backend_specific_opts_insert(k, v)

        if inputs_function is not None:
            input_sets = inputs_function(
                model, trace, export_options, None, submod_modelpath
            )
        else:
            raise RuntimeError(
                "inputs_function needs to be specified in accelerator decorator"
            )
        compilation_group.set_input_sets(input_sets)

        trace = torch_glow.to_glow_selective(
            trace,
            {submod_tracepath: spec},
            inplace=False,
        )

        return trace
    else:
        return trace
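
Code example #7 pops the glow:-prefixed keys out of compilation_spec_dict before forwarding the remainder as backend-specific options. A minimal sketch of what such a dict might contain; the glow: keys appear above and the NNPI keys are taken from code example #1, but the concrete dict is ultimately supplied by the accelerator decorator:

# Hypothetical spec dict as an accelerator-decorated module might provide it:
compilation_spec_dict = {
    "glow:ConvertToFP16": "true",   # consumed above via set_convert_to_fp16(...)
    "glow:ReplicationCount": "2",   # consumed above via set_replication_count(...)
    "NNPI_IceCores": "4",           # remaining entries go to backend_specific_opts_insert(k, v)
    "NNPINumParallelChunks": "4",
}

Routing FP16 conversion and replication through the dict lets individual modules override them without growing the function signature.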